"git@developer.sourcefind.cn:jerrrrry/infinicore.git" did not exist on "147a4ac7df309db846384f874d0e9754c3067700"
Unverified Commit 7444946d authored by Shijie's avatar Shijie Committed by GitHub
Browse files

[Paddle] Add nn layer (#361)



* Add nn.layer: softmax, attention, transformer
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* code refactor
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* code refactor
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* update docs and set dropout=0.1
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* Update transformer_engine/paddle/layer/attention.py
Signed-off-by: default avatarKirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>
Signed-off-by: default avatarKirthi Shankar Sivamani <ksivamani@nvidia.com>
Co-authored-by: default avatarKirthi Shankar Sivamani <ksivamani@nvidia.com>
parent e4f9e767
This diff is collapsed.
...@@ -46,7 +46,7 @@ from transformer_engine.paddle.constants import FP8FwdTensors ...@@ -46,7 +46,7 @@ from transformer_engine.paddle.constants import FP8FwdTensors
from transformer_engine.common.recipe import DelayedScaling from transformer_engine.common.recipe import DelayedScaling
np.random.seed(10) np.random.seed(10)
paddle.seed(10) paddle.seed(11)
GEMM_CASES = [(256, 256, 512), (32, 32, 32), (16384, 1024, 2816), (16384, 2816, 1024), GEMM_CASES = [(256, 256, 512), (32, 32, 32), (16384, 1024, 2816), (16384, 2816, 1024),
(16384, 1024, 1024)] (16384, 1024, 1024)]
is_fp8_supported, reason = is_fp8_available() is_fp8_supported, reason = is_fp8_available()
...@@ -400,7 +400,7 @@ class TestLayerNorm: ...@@ -400,7 +400,7 @@ class TestLayerNorm:
y_ref, mu_ref, rsigma_ref = self.calc_fwd_ref(x, eps, gamma, beta) y_ref, mu_ref, rsigma_ref = self.calc_fwd_ref(x, eps, gamma, beta)
assert_allclose(y, y_ref, rtol=1e-5, atol=1e-5) assert_allclose(y, y_ref, rtol=1e-4, atol=1e-4)
assert_allclose(mu, mu_ref, rtol=1e-3, atol=1e-3) assert_allclose(mu, mu_ref, rtol=1e-3, atol=1e-3)
assert_allclose(rsigma, rsigma_ref, rtol=5e-2, atol=5e-2) assert_allclose(rsigma, rsigma_ref, rtol=5e-2, atol=5e-2)
...@@ -725,10 +725,8 @@ class TestFusedAttn: ...@@ -725,10 +725,8 @@ class TestFusedAttn:
q_grad = dq q_grad = dq
k_grad = dkv[:, :, 0, :, :] k_grad = dkv[:, :, 0, :, :]
v_grad = dkv[:, :, 1, :, :] v_grad = dkv[:, :, 1, :, :]
fwd_out = paddle.reshape(
out, shape=[self.batch_size, self.q_seqlen, self.num_heads, self.head_size])
return fwd_out, q_grad, k_grad, v_grad return out, q_grad, k_grad, v_grad
@pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), @pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0),
reason="cuDNN fMHA requires Ampere+ GPU") reason="cuDNN fMHA requires Ampere+ GPU")
......
...@@ -3,5 +3,6 @@ ...@@ -3,5 +3,6 @@
# See LICENSE for license information. # See LICENSE for license information.
"""Transformer Engine bindings for Paddle""" """Transformer Engine bindings for Paddle"""
from .layer import Linear, LayerNorm, LayerNormLinear, LayerNormMLP
from .fp8 import fp8_autocast from .fp8 import fp8_autocast
from .layer import (Linear, LayerNorm, LayerNormLinear, LayerNormMLP, FusedScaleMaskSoftmax,
DotProductAttention, MultiHeadAttention, TransformerLayer)
...@@ -40,3 +40,9 @@ TE_DType = { ...@@ -40,3 +40,9 @@ TE_DType = {
paddle.float16: tex.DType.kFloat16, paddle.float16: tex.DType.kFloat16,
paddle.bfloat16: tex.DType.kBFloat16, paddle.bfloat16: tex.DType.kBFloat16,
} }
AttnMaskTypes = ("causal", "padding", "no_mask")
AttnTypes = ("self", "cross")
LayerTypes = ("encoder", "decoder")
...@@ -435,9 +435,9 @@ def fused_attn_fwd_qkvpacked( ...@@ -435,9 +435,9 @@ def fused_attn_fwd_qkvpacked(
assert (Bias.dtype == qkv.dtype), "bias tensor must be in the same dtype as qkv." assert (Bias.dtype == qkv.dtype), "bias tensor must be in the same dtype as qkv."
if set_zero: if set_zero:
out = paddle.full(shape=[total_seqs, h, d], fill_value=0, dtype=qkv.dtype) out = paddle.full(shape=[b, max_seqlen, h, d], fill_value=0, dtype=qkv.dtype)
else: else:
out = paddle.empty(shape=[total_seqs, h, d], dtype=qkv.dtype) out = paddle.empty(shape=[b, max_seqlen, h, d], dtype=qkv.dtype)
if is_training: if is_training:
softmax_aux = paddle.empty(shape=[b, h, max_seqlen, max_seqlen], dtype=qkv.dtype) softmax_aux = paddle.empty(shape=[b, h, max_seqlen, max_seqlen], dtype=qkv.dtype)
...@@ -574,9 +574,9 @@ def fused_attn_fwd_kvpacked( ...@@ -574,9 +574,9 @@ def fused_attn_fwd_kvpacked(
assert (Bias.dtype == q.dtype), "bias tensor must be in the same dtype as q and kv." assert (Bias.dtype == q.dtype), "bias tensor must be in the same dtype as q and kv."
if set_zero: if set_zero:
out = paddle.full(shape=[total_seqs_q, h, d], fill_value=0, dtype=q.dtype) out = paddle.full(shape=[b, max_seqlen_q, h, d], fill_value=0, dtype=q.dtype)
else: else:
out = paddle.empty(shape=[total_seqs_q, h, d], dtype=q.dtype) out = paddle.empty(shape=[b, max_seqlen_q, h, d], dtype=q.dtype)
if is_training: if is_training:
softmax_aux = paddle.empty(shape=[b, h, max_seqlen_q, max_seqlen_kv], dtype=q.dtype) softmax_aux = paddle.empty(shape=[b, h, max_seqlen_q, max_seqlen_kv], dtype=q.dtype)
......
...@@ -3,7 +3,10 @@ ...@@ -3,7 +3,10 @@
# See LICENSE for license information. # See LICENSE for license information.
"""Layer level Paddle APIs""" """Layer level Paddle APIs"""
from .attention import DotProductAttention, MultiHeadAttention
from .layernorm import LayerNorm from .layernorm import LayerNorm
from .layernorm_linear import LayerNormLinear from .layernorm_linear import LayerNormLinear
from .layernorm_mlp import LayerNormMLP from .layernorm_mlp import LayerNormMLP
from .linear import Linear from .linear import Linear
from .softmax import FusedScaleMaskSoftmax
from .transformer import TransformerLayer
This diff is collapsed.
...@@ -126,7 +126,7 @@ class LayerNorm(paddle.nn.Layer): ...@@ -126,7 +126,7 @@ class LayerNorm(paddle.nn.Layer):
"Paddle backend does not support LayerNorm with zero-centered scale.") "Paddle backend does not support LayerNorm with zero-centered scale.")
return F.layer_norm(x=inp, return F.layer_norm(x=inp,
normalized_shape=inp.shape[1:], normalized_shape=inp.shape[-1],
weight=self.weight, weight=self.weight,
bias=self.bias, bias=self.bias,
epsilon=self.eps) epsilon=self.eps)
......
...@@ -402,7 +402,6 @@ class LayerNormLinear(TransformerEngineBaseLayer): ...@@ -402,7 +402,6 @@ class LayerNormLinear(TransformerEngineBaseLayer):
if self.return_layernorm_output: if self.return_layernorm_output:
out, ln_out = out out, ln_out = out
return out, ln_out return out, ln_out
return out return out
def _pd_forward( def _pd_forward(
...@@ -415,7 +414,7 @@ class LayerNormLinear(TransformerEngineBaseLayer): ...@@ -415,7 +414,7 @@ class LayerNormLinear(TransformerEngineBaseLayer):
"Paddle backend does not support LayerNorm with zero-centered scale.") "Paddle backend does not support LayerNorm with zero-centered scale.")
ln_out = F.layer_norm(x=inp, ln_out = F.layer_norm(x=inp,
normalized_shape=inp.shape[1:], normalized_shape=inp.shape[-1],
weight=self.ln_weight, weight=self.ln_weight,
bias=self.ln_bias, bias=self.ln_bias,
epsilon=self.eps) epsilon=self.eps)
......
...@@ -624,7 +624,7 @@ class LayerNormMLP(TransformerEngineBaseLayer): ...@@ -624,7 +624,7 @@ class LayerNormMLP(TransformerEngineBaseLayer):
"Paddle backend does not support LayerNorm with zero-centered scale.") "Paddle backend does not support LayerNorm with zero-centered scale.")
ln_out = F.layer_norm(x=inp, ln_out = F.layer_norm(x=inp,
normalized_shape=inp.shape[1:], normalized_shape=inp.shape[-1],
weight=self.ln_weight, weight=self.ln_weight,
bias=self.ln_bias, bias=self.ln_bias,
epsilon=self.eps) epsilon=self.eps)
......
# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
"""Fused scaled masked softmax functions"""
import os
import warnings
from typing import Callable, Tuple, Union, Optional
import paddle
from transformer_engine.paddle.cpp_extensions import (
scaled_upper_triang_masked_softmax_forward,
scaled_upper_triang_masked_softmax_backward,
scaled_masked_softmax_forward,
scaled_masked_softmax_backward,
scaled_softmax_forward,
scaled_softmax_backward,
)
# CUDA launch geometry assumed by the fused softmax kernels; used by
# FusedScaleMaskSoftmax.get_batch_per_block to size work per thread block.
THREADS_PER_WARP = 32
THREADS_PER_BLOCK = 128

# Lazy cache of boolean causal masks keyed by sequence length.
# NOTE: grows unbounded if many distinct sequence lengths are seen.
_default_causal_mask = {}
def _get_default_causal_mask(seqlen: int) -> paddle.Tensor:
    """Return (and memoize) the boolean upper-triangular causal mask of shape [seqlen, seqlen]."""
    mask = _default_causal_mask.get(seqlen)
    if mask is None:
        ones = paddle.ones((seqlen, seqlen))
        mask = paddle.triu(ones, diagonal=1).cast('bool')
        _default_causal_mask[seqlen] = mask
    return mask
class ScaledUpperTriangMaskedSoftmax(paddle.autograd.PyLayer):
    """
    Fused kernel performing, in one pass:
    1. scaling of the input tensor,
    2. application of an upper-triangular (causal) mask, as used in GPT-style models,
    3. softmax.
    """

    @staticmethod
    def forward(ctx, inputs: paddle.Tensor, scale: float) -> paddle.Tensor:
        """ScaledUpperTriangMaskedSoftmax fwd"""
        scale_tensor = paddle.Tensor([scale])
        probs = scaled_upper_triang_masked_softmax_forward(inputs, scale_tensor[0])
        # Saved for the backward pass: the softmax output and the scale.
        ctx.save_for_backward(probs, scale_tensor)
        return probs

    @staticmethod
    def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]:
        """ScaledUpperTriangMaskedSoftmax bwd"""
        probs, scale_tensor = ctx.saved_tensor()
        grads = scaled_upper_triang_masked_softmax_backward(output_grads, probs,
                                                            scale_tensor[0])
        # No gradient for the `scale` argument.
        return grads, None
class ScaledMaskedSoftmax(paddle.autograd.PyLayer):
    """
    Fused kernel performing, in one pass:
    1. scaling of the input tensor,
    2. application of the provided mask,
    3. softmax.
    """

    @staticmethod
    def forward(ctx, inputs: paddle.Tensor, mask: paddle.Tensor, scale: float) -> paddle.Tensor:
        """ScaledMaskedSoftmax fwd"""
        scale_tensor = paddle.Tensor([scale])
        probs = scaled_masked_softmax_forward(inputs, mask, scale_tensor[0])
        # Saved for the backward pass: the softmax output and the scale.
        ctx.save_for_backward(probs, scale_tensor)
        return probs

    @staticmethod
    def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]:
        """ScaledMaskedSoftmax bwd"""
        probs, scale_tensor = ctx.saved_tensor()
        grads = scaled_masked_softmax_backward(output_grads, probs, scale_tensor[0])
        # No gradients for the `mask` and `scale` arguments.
        return grads, None, None
class ScaledSoftmax(paddle.autograd.PyLayer):
    """
    Fused kernel performing, in one pass:
    1. scaling of the input tensor,
    2. softmax.
    """

    @staticmethod
    def forward(ctx, inputs: paddle.Tensor, scale: float) -> paddle.Tensor:
        """ScaledSoftmax fwd"""
        scale_tensor = paddle.Tensor([scale])
        probs = scaled_softmax_forward(inputs, scale_tensor[0])
        # Saved for the backward pass: the softmax output and the scale.
        ctx.save_for_backward(probs, scale_tensor)
        return probs

    @staticmethod
    def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]:
        """ScaledSoftmax bwd"""
        probs, scale_tensor = ctx.saved_tensor()
        grads = scaled_softmax_backward(output_grads, probs, scale_tensor[0])
        # No gradients for the unused mask slot and the `scale` argument.
        return grads, None, None
class FusedScaleMaskSoftmax(paddle.nn.Layer):
    """
    Scaling + masking + softmax, using a fused kernel when available.

    Arguments:
        attn_mask_type: attention mask type ("causal" or padding-style masking).
        mask_func: mask function applied to the scaled scores (paddle backend only).
        softmax_in_fp32: if True, softmax is performed at fp32 precision.
        backend: 'transformer_engine' (fused kernels) or 'paddle' (eager ops).
    """

    def __init__(
        self,
        attn_mask_type: str,
        mask_func: Callable,
        softmax_in_fp32: bool = True,
        backend: str = 'transformer_engine',
    ) -> None:
        super().__init__()
        self.attn_mask_type = attn_mask_type
        # Fusion can be disabled globally through the environment variable.
        self.scaled_masked_softmax_fusion = bool(int(os.getenv("NVTE_MASKED_SOFTMAX_FUSION", "1")))
        self.mask_func = mask_func
        self.softmax_in_fp32 = softmax_in_fp32
        self.backend = backend

    def forward(
        self,
        inp: paddle.Tensor,
        mask: paddle.Tensor,
        scale: Optional[float] = None,
    ) -> paddle.Tensor:
        """FusedScaleMaskSoftmax fprop"""
        # [batch_size, num_heads, s_q, s_kv]
        assert inp.dim() == 4
        self.input_is_fp16 = inp.dtype == paddle.float16
        self.input_is_bf16 = inp.dtype == paddle.bfloat16
        self.input_in_16bit_float = self.input_is_fp16 or self.input_is_bf16

        assert (scale is None or self.softmax_in_fp32), "softmax should be in fp32 when scaled"

        if self.backend == 'transformer_engine' and not self.is_kernel_available(*inp.shape):
            warnings.warn(
                "fused kernel is not available for this input shape, fall back to paddle backend")
            # NOTE: this permanently switches the layer to the paddle backend for
            # all subsequent calls, not only for this input shape.
            self.backend = 'paddle'

        if self.backend == 'transformer_engine':
            return self._te_forward(inp, mask, scale)
        if self.backend == 'paddle':
            return self._pd_forward(inp, mask, scale)
        raise AttributeError(f"Backend {self.backend} is not supported.")

    def is_kernel_available(self, b: int, h: int, s_q: int, s_kv: int) -> bool:
        """Check FusedScaleMaskSoftmax kernel availability based on size"""
        attn_batches = b * h

        if (self.scaled_masked_softmax_fusion    # user wants to fuse
                and self.input_in_16bit_float    # input must be fp16 or bf16
                and 16 < s_kv <= 4096            # s_kv must be in (16, 4096]
                and s_q % 4 == 0                 # s_q must be a multiple of 4
                and attn_batches % 4 == 0        # b * h must be a multiple of 4
           ):
            # The range check above already guarantees 0 <= s_kv <= 4096, so the
            # batch-per-block divisibility constraint is the only remaining gate.
            batch_per_block = self.get_batch_per_block(int(s_kv))

            if self.attn_mask_type == "causal":
                if attn_batches % batch_per_block == 0:
                    return True
            else:
                if s_q % batch_per_block == 0:
                    return True
        return False

    def _te_forward(self,
                    inp: paddle.Tensor,
                    mask: paddle.Tensor,
                    scale: Optional[float] = None) -> paddle.Tensor:
        """Fused masked softmax kernel"""
        # BUGFIX: paddle.Tensor has no callable size(); unpack dims from `shape`,
        # consistent with the `is_kernel_available(*inp.shape)` call in forward().
        b, h, s_q, s_kv = inp.shape
        scale = 1.0 if scale is None else scale

        if self.attn_mask_type == "causal":
            assert s_q == s_kv, "causal mask is only for self attention"
            # The causal kernel expects a 3D tensor (attn_batches, s_q, s_kv).
            inp = inp.reshape((-1, s_q, s_kv))
            probs = ScaledUpperTriangMaskedSoftmax.apply(inp, scale)
            return probs.reshape((b, h, s_q, s_kv))
        # input is 4D tensor (b, h, s_q, s_kv)
        if mask is not None:
            return ScaledMaskedSoftmax.apply(inp, mask, scale)
        return ScaledSoftmax.apply(inp, scale)

    def _pd_forward(self,
                    inp: paddle.Tensor,
                    mask: paddle.Tensor,
                    scale: Optional[float] = None) -> paddle.Tensor:
        """Call Paddle OP"""
        if self.input_in_16bit_float and self.softmax_in_fp32:
            # Compute the softmax at fp32 for numerical stability.
            inp = paddle.cast(inp, 'float32')

        if scale is not None:
            inp = inp * scale

        if self.attn_mask_type == "causal":
            # The supplied mask is ignored for causal attention; a cached
            # upper-triangular mask of shape [s_q, s_q] is used instead.
            mask = _get_default_causal_mask(inp.shape[2])

        mask_output = self.mask_func(inp, mask) if mask is not None else inp
        probs = paddle.nn.functional.softmax(mask_output, axis=-1)

        if self.input_in_16bit_float and self.softmax_in_fp32:
            # Cast back to the original 16-bit input dtype.
            if self.input_is_fp16:
                probs = paddle.cast(probs, 'float16')
            else:
                probs = paddle.cast(probs, 'bfloat16')

        return probs

    @staticmethod
    def get_batch_per_block(key_seq_len: int) -> int:
        """Softmax utility: batches processed per CUDA block for a given s_kv."""
        # Round s_kv up to the next power of two, then derive the launch geometry.
        pow2 = 1 << (key_seq_len - 1).bit_length()
        warp_size = pow2 if pow2 < THREADS_PER_WARP else THREADS_PER_WARP
        batches_per_warp = 2 if pow2 <= 128 else 1
        warps_per_block = THREADS_PER_BLOCK // warp_size
        batches_per_block = warps_per_block * batches_per_warp
        return batches_per_block
# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
"""Transformer"""
from typing import Optional, Union
import paddle
from transformer_engine.paddle.constants import (
AttnMaskTypes,
LayerTypes,
)
from transformer_engine.paddle.layer import (LayerNormMLP, LayerNorm, MultiHeadAttention)
from .base import TransformerEngineBaseLayer
class TransformerLayer(TransformerEngineBaseLayer):
    r"""
    TransformerLayer is made up of an attention block and a feedforward network (MLP).
    This standard layer is based on the paper "Attention Is All You Need".

    Parameters
    ----------
    hidden_size : int
        size of each input sample.
    ffn_hidden_size : int
        intermediate size to which input samples are projected.
    num_attention_heads : int
        number of attention heads in the transformer layer.
    layernorm_epsilon : float, default = 1e-5
        a value added to the denominator of layer normalization
        for numerical stability.
    hidden_dropout: float, default = 0.1
        dropout probability for the dropout op after FC2 layer.
    attention_dropout: float, default = 0.1
        dropout probability for the dropout op during multi-head attention.
    weight_attr : Union[paddle.ParamAttr, None], default = `None`
        parameter attribute forwarded to the attention and MLP sub-layers.
    bias_attr : Union[paddle.ParamAttr, None, bool], default = `None`
        bias attribute forwarded to the attention and MLP sub-layers.
    self_attn_mask_type: {'causal', 'padding'}, default = `causal`
        type of attention mask passed into softmax operation.
    params_dtype : paddle.dtype, default = `paddle.get_default_dtype()`
        it controls the type used to allocate the initial parameters. Useful when
        the model is trained with lower precision and the original FP32 parameters
        would not fit in GPU memory.
    apply_residual_connection_post_layernorm : bool, default = `False`
        if set to `True`, residual connections are taken
        from the output of layer norm (default is taken
        from input of layer norm)
    output_layernorm: bool, default = `False`
        if set to `True`, layer normalization is applied on the output side,
        after the final dropout-add. default behavior is to apply layer
        normalization on the input side, before the QKV transformation.
    layer_type: {'encoder', 'decoder'}, default = `encoder`
        if set to `decoder`, an additional cross-attn block is added after self-attn.
        This can be used for structures like `T5` Transformer in conjunction with the
        `encoder` option.
    zero_centered_gamma : bool, default = 'False'
        if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
        the LayerNorm formula changes to

        .. math::
            y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
            (1 + \gamma) + \beta
    activation : str, default = 'gelu'
        Type of activation used in MLP block.
        Options are: 'gelu', 'relu', 'reglu', 'geglu' and 'swiglu'.
    backend : str, default = 'transformer_engine'
        backend forwarded to the attention and MLP sub-layers.
    """

    def __init__(self,
                 hidden_size: int,
                 ffn_hidden_size: int,
                 num_attention_heads: int,
                 layernorm_epsilon: float = 1e-5,
                 hidden_dropout: float = 0.1,
                 attention_dropout: float = 0.1,
                 weight_attr: Union[paddle.ParamAttr, None] = None,
                 bias_attr: Union[paddle.ParamAttr, None, bool] = None,
                 self_attn_mask_type: str = "causal",
                 params_dtype: Optional[paddle.dtype] = None,
                 apply_residual_connection_post_layernorm: bool = False,
                 output_layernorm: bool = False,
                 layer_type: str = "encoder",
                 zero_centered_gamma: bool = False,
                 activation: str = 'gelu',
                 backend: str = 'transformer_engine') -> None:
        super().__init__()

        params_dtype = paddle.get_default_dtype() if params_dtype is None else params_dtype
        self.output_layernorm = output_layernorm
        self.layer_type = layer_type
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.self_attn_mask_type = self_attn_mask_type
        # Validate configuration against the closed sets declared in constants.py.
        assert (self_attn_mask_type
                in AttnMaskTypes), f"self_attn_mask_type {self_attn_mask_type} not supported"
        assert layer_type in LayerTypes, f"layer_type {layer_type} not supported"

        # Positional/keyword arguments shared by the self- and cross-attention blocks.
        attention_args = (
            hidden_size,
            num_attention_heads,
            attention_dropout,
            layernorm_epsilon,
            weight_attr,
            bias_attr,
        )
        common_attention_kwargs = {
            "params_dtype": params_dtype,
            "return_layernorm_output": apply_residual_connection_post_layernorm,
            "zero_centered_gamma": zero_centered_gamma,
            "backend": backend,
        }

        # When output_layernorm is set, normalization happens after the final
        # dropout-add instead, so the attention block skips its input LayerNorm.
        self.self_attention = MultiHeadAttention(
            *attention_args,
            **common_attention_kwargs,
            attn_mask_type=self_attn_mask_type,
            input_layernorm=not output_layernorm,
            attention_type="self",
        )

        # Decoder layers add a cross-attention block (T5-style) after self-attention.
        if layer_type == "decoder":
            self.inter_attention = MultiHeadAttention(
                *attention_args,
                **common_attention_kwargs,
                attn_mask_type="padding",
                input_layernorm=True,
                attention_type="cross",
            )

        self.layernorm_mlp = LayerNormMLP(
            hidden_size,
            ffn_hidden_size,
            eps=layernorm_epsilon,
            weight_attr=weight_attr,
            bias_attr=bias_attr,
            activation=activation,
            return_layernorm_output=apply_residual_connection_post_layernorm,
            zero_centered_gamma=zero_centered_gamma,
            backend=backend,
        )

        self.hidden_dropout = hidden_dropout

        # Final LayerNorm for BERT-like architectures (applied after dropout-add).
        if self.output_layernorm:
            self.layernorm = LayerNorm(
                hidden_size,
                layernorm_epsilon,
                weight_attr,
                bias_attr,
                zero_centered_gamma=zero_centered_gamma,
                backend=backend,
            )

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        encoder_output: Optional[paddle.Tensor] = None,
        enc_dec_attn_mask: Optional[paddle.Tensor] = None,
        core_attention_bias_type: str = "no_bias",
        core_attention_bias: Optional[paddle.Tensor] = None,
        set_zero: bool = True,
    ) -> paddle.Tensor:
        """
        Transformer Layer: attention block and a feedforward network (MLP)

        .. note::
            Argument :attr:`attention_mask` will be ignored when :attr:`self_attn_mask_type`
            is set to `"causal"`.

        Parameters
        ----------
        hidden_states : paddle.Tensor
            Input tensor.
        attention_mask : Optional[paddle.Tensor], default = `None`
            Boolean tensor used to mask out self-attention softmax input.
        encoder_output : Optional[paddle.Tensor], default = `None`
            Output of the encoder block to be fed into the decoder block if using
            `layer_type="decoder"`.
        enc_dec_attn_mask : Optional[paddle.Tensor], default = `None`
            Boolean tensor used to mask out inter-attention softmax input if using
            `layer_type="decoder"`.
        core_attention_bias_type: str, default = `no_bias`
        core_attention_bias: Optional[paddle.Tensor], default = `None`
            Bias tensor for Q * K.T
        set_zero: bool, default = `True`
            Whether to set output tensors to 0 or not before use.
        """

        if self.self_attn_mask_type != "causal" and attention_mask is not None:
            assert (attention_mask.dtype == paddle.bool), "Attention mask must be a boolean tensor"

        assert core_attention_bias_type in ['no_bias'], f"Only no_bias is supported currently, " \
            f"but receive core_attention_bias_type = {core_attention_bias_type}"

        # Self attention.
        self_attention_outputs = self.self_attention(
            hidden_states,
            attention_mask,
            core_attention_bias_type=core_attention_bias_type,
            core_attention_bias=core_attention_bias,
            set_zero=set_zero,
        )

        # The attention block returns (output, ln_out) when residuals are taken
        # post-layernorm; otherwise the residual is the block's own input.
        if self.apply_residual_connection_post_layernorm and not self.output_layernorm:
            attention_output, residual = self_attention_outputs
        else:
            attention_output = self_attention_outputs
            residual = hidden_states

        # Dropout-add.
        # NOTE(review): dropout is applied with training=True unconditionally, so it
        # remains active even in eval mode — confirm this is intended.
        out = paddle.nn.functional.dropout(
            attention_output,
            p=self.hidden_dropout,
            training=True,
        )
        bda_output = residual + out

        # Cross attention.
        if self.layer_type == "decoder":
            inter_attention_outputs = self.inter_attention(
                bda_output,
                enc_dec_attn_mask,
                encoder_output=encoder_output,
                core_attention_bias_type=core_attention_bias_type,
                core_attention_bias=core_attention_bias,
                set_zero=set_zero,
            )
            if self.apply_residual_connection_post_layernorm:
                attention_output, residual = inter_attention_outputs
            else:
                attention_output = inter_attention_outputs
                residual = bda_output

            # Dropout-add.
            out = paddle.nn.functional.dropout(
                attention_output,
                p=self.hidden_dropout,
                training=True,
            )
            bda_output = residual + out

        # MLP.
        mlp_outputs = self.layernorm_mlp(bda_output)
        if self.apply_residual_connection_post_layernorm:
            mlp_output, residual = mlp_outputs
        else:
            mlp_output = mlp_outputs
            residual = bda_output

        # Dropout-add.
        out = paddle.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=True)
        output = residual + out

        # For BERT like architectures.
        if self.output_layernorm:
            output = self.layernorm(output)

        # output: [b, s, hidden]
        return output
...@@ -52,3 +52,37 @@ def get_paddle_act_func(activation): ...@@ -52,3 +52,37 @@ def get_paddle_act_func(activation):
if activation not in funcs: if activation not in funcs:
raise "Activation type " + activation + " is not supported." raise "Activation type " + activation + " is not supported."
return funcs[activation] return funcs[activation]
def attention_mask_func(attention_scores: paddle.Tensor,
attention_mask: paddle.Tensor) -> paddle.Tensor:
"""Get attention mask"""
def _masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
return paddle.where(mask, y, x)
attention_scores = _masked_fill(attention_scores, attention_mask, -10000.0)
return attention_scores
def mask_to_cu_seqlens(mask: paddle.Tensor, need_kv: bool = False) -> paddle.Tensor:
"""Convert mask to cu_seqlens"""
assert 'bool' in str(mask.dtype), "mask must be bool dtype"
assert len(mask.shape) == 4 and mask.shape[1] == 1, "mask must be [b, 1, s_q, s_kv]"
q_actual_seqlens = paddle.sum(mask[:, :, :, 0] == False, axis=(-1, -2), dtype='int32') # pylint: disable=singleton-comparison
q_cu_seqlens = paddle.cumsum(q_actual_seqlens)
q_cu_seqlens = paddle.concat([paddle.zeros([1], dtype=paddle.int32), q_cu_seqlens], axis=0)
if not need_kv:
return q_cu_seqlens, None
kv_actual_seqlens = paddle.sum(mask[:, :, 0, :] == False, axis=(-1, -2), dtype='int32') # pylint: disable=singleton-comparison
kv_cu_seqlens = paddle.cumsum(kv_actual_seqlens)
kv_cu_seqlens = paddle.concat([paddle.zeros([1], dtype=paddle.int32), kv_cu_seqlens], axis=0)
return q_cu_seqlens, kv_cu_seqlens
def divide(numerator: int, denominator: int) -> int:
"""Ensure that numerator is divisible by the denominator and return
the division value."""
assert (numerator % denominator == 0), f"{numerator} is not divisible by {denominator}"
return numerator // denominator
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment