"git@developer.sourcefind.cn:jerrrrry/infinicore.git" did not exist on "147a4ac7df309db846384f874d0e9754c3067700"
Unverified Commit 7444946d authored by Shijie's avatar Shijie Committed by GitHub
Browse files

[Paddle] Add nn layer (#361)



* Add nn.layer: softmax, attention, transformer
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* code refactor
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* code refactor
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* update docs and set dropout=0.1
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>

* Update transformer_engine/paddle/layer/attention.py
Signed-off-by: default avatarKirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: default avatarShijie Wang <jaywan@nvidia.com>
Signed-off-by: default avatarKirthi Shankar Sivamani <ksivamani@nvidia.com>
Co-authored-by: default avatarKirthi Shankar Sivamani <ksivamani@nvidia.com>
parent e4f9e767
This diff is collapsed.
...@@ -46,7 +46,7 @@ from transformer_engine.paddle.constants import FP8FwdTensors ...@@ -46,7 +46,7 @@ from transformer_engine.paddle.constants import FP8FwdTensors
from transformer_engine.common.recipe import DelayedScaling from transformer_engine.common.recipe import DelayedScaling
np.random.seed(10) np.random.seed(10)
paddle.seed(10) paddle.seed(11)
GEMM_CASES = [(256, 256, 512), (32, 32, 32), (16384, 1024, 2816), (16384, 2816, 1024), GEMM_CASES = [(256, 256, 512), (32, 32, 32), (16384, 1024, 2816), (16384, 2816, 1024),
(16384, 1024, 1024)] (16384, 1024, 1024)]
is_fp8_supported, reason = is_fp8_available() is_fp8_supported, reason = is_fp8_available()
...@@ -400,7 +400,7 @@ class TestLayerNorm: ...@@ -400,7 +400,7 @@ class TestLayerNorm:
y_ref, mu_ref, rsigma_ref = self.calc_fwd_ref(x, eps, gamma, beta) y_ref, mu_ref, rsigma_ref = self.calc_fwd_ref(x, eps, gamma, beta)
assert_allclose(y, y_ref, rtol=1e-5, atol=1e-5) assert_allclose(y, y_ref, rtol=1e-4, atol=1e-4)
assert_allclose(mu, mu_ref, rtol=1e-3, atol=1e-3) assert_allclose(mu, mu_ref, rtol=1e-3, atol=1e-3)
assert_allclose(rsigma, rsigma_ref, rtol=5e-2, atol=5e-2) assert_allclose(rsigma, rsigma_ref, rtol=5e-2, atol=5e-2)
...@@ -725,10 +725,8 @@ class TestFusedAttn: ...@@ -725,10 +725,8 @@ class TestFusedAttn:
q_grad = dq q_grad = dq
k_grad = dkv[:, :, 0, :, :] k_grad = dkv[:, :, 0, :, :]
v_grad = dkv[:, :, 1, :, :] v_grad = dkv[:, :, 1, :, :]
fwd_out = paddle.reshape(
out, shape=[self.batch_size, self.q_seqlen, self.num_heads, self.head_size])
return fwd_out, q_grad, k_grad, v_grad return out, q_grad, k_grad, v_grad
@pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0), @pytest.mark.skipif(paddle.device.cuda.get_device_capability() < (8, 0),
reason="cuDNN fMHA requires Ampere+ GPU") reason="cuDNN fMHA requires Ampere+ GPU")
......
...@@ -3,5 +3,6 @@ ...@@ -3,5 +3,6 @@
# See LICENSE for license information. # See LICENSE for license information.
"""Transformer Engine bindings for Paddle""" """Transformer Engine bindings for Paddle"""
from .layer import Linear, LayerNorm, LayerNormLinear, LayerNormMLP
from .fp8 import fp8_autocast from .fp8 import fp8_autocast
from .layer import (Linear, LayerNorm, LayerNormLinear, LayerNormMLP, FusedScaleMaskSoftmax,
DotProductAttention, MultiHeadAttention, TransformerLayer)
...@@ -40,3 +40,9 @@ TE_DType = { ...@@ -40,3 +40,9 @@ TE_DType = {
paddle.float16: tex.DType.kFloat16, paddle.float16: tex.DType.kFloat16,
paddle.bfloat16: tex.DType.kBFloat16, paddle.bfloat16: tex.DType.kBFloat16,
} }
AttnMaskTypes = ("causal", "padding", "no_mask")
AttnTypes = ("self", "cross")
LayerTypes = ("encoder", "decoder")
...@@ -435,9 +435,9 @@ def fused_attn_fwd_qkvpacked( ...@@ -435,9 +435,9 @@ def fused_attn_fwd_qkvpacked(
assert (Bias.dtype == qkv.dtype), "bias tensor must be in the same dtype as qkv." assert (Bias.dtype == qkv.dtype), "bias tensor must be in the same dtype as qkv."
if set_zero: if set_zero:
out = paddle.full(shape=[total_seqs, h, d], fill_value=0, dtype=qkv.dtype) out = paddle.full(shape=[b, max_seqlen, h, d], fill_value=0, dtype=qkv.dtype)
else: else:
out = paddle.empty(shape=[total_seqs, h, d], dtype=qkv.dtype) out = paddle.empty(shape=[b, max_seqlen, h, d], dtype=qkv.dtype)
if is_training: if is_training:
softmax_aux = paddle.empty(shape=[b, h, max_seqlen, max_seqlen], dtype=qkv.dtype) softmax_aux = paddle.empty(shape=[b, h, max_seqlen, max_seqlen], dtype=qkv.dtype)
...@@ -574,9 +574,9 @@ def fused_attn_fwd_kvpacked( ...@@ -574,9 +574,9 @@ def fused_attn_fwd_kvpacked(
assert (Bias.dtype == q.dtype), "bias tensor must be in the same dtype as q and kv." assert (Bias.dtype == q.dtype), "bias tensor must be in the same dtype as q and kv."
if set_zero: if set_zero:
out = paddle.full(shape=[total_seqs_q, h, d], fill_value=0, dtype=q.dtype) out = paddle.full(shape=[b, max_seqlen_q, h, d], fill_value=0, dtype=q.dtype)
else: else:
out = paddle.empty(shape=[total_seqs_q, h, d], dtype=q.dtype) out = paddle.empty(shape=[b, max_seqlen_q, h, d], dtype=q.dtype)
if is_training: if is_training:
softmax_aux = paddle.empty(shape=[b, h, max_seqlen_q, max_seqlen_kv], dtype=q.dtype) softmax_aux = paddle.empty(shape=[b, h, max_seqlen_q, max_seqlen_kv], dtype=q.dtype)
......
...@@ -3,7 +3,10 @@ ...@@ -3,7 +3,10 @@
# See LICENSE for license information. # See LICENSE for license information.
"""Layer level Paddle APIs""" """Layer level Paddle APIs"""
from .attention import DotProductAttention, MultiHeadAttention
from .layernorm import LayerNorm from .layernorm import LayerNorm
from .layernorm_linear import LayerNormLinear from .layernorm_linear import LayerNormLinear
from .layernorm_mlp import LayerNormMLP from .layernorm_mlp import LayerNormMLP
from .linear import Linear from .linear import Linear
from .softmax import FusedScaleMaskSoftmax
from .transformer import TransformerLayer
This diff is collapsed.
...@@ -126,7 +126,7 @@ class LayerNorm(paddle.nn.Layer): ...@@ -126,7 +126,7 @@ class LayerNorm(paddle.nn.Layer):
"Paddle backend does not support LayerNorm with zero-centered scale.") "Paddle backend does not support LayerNorm with zero-centered scale.")
return F.layer_norm(x=inp, return F.layer_norm(x=inp,
normalized_shape=inp.shape[1:], normalized_shape=inp.shape[-1],
weight=self.weight, weight=self.weight,
bias=self.bias, bias=self.bias,
epsilon=self.eps) epsilon=self.eps)
......
...@@ -402,7 +402,6 @@ class LayerNormLinear(TransformerEngineBaseLayer): ...@@ -402,7 +402,6 @@ class LayerNormLinear(TransformerEngineBaseLayer):
if self.return_layernorm_output: if self.return_layernorm_output:
out, ln_out = out out, ln_out = out
return out, ln_out return out, ln_out
return out return out
def _pd_forward( def _pd_forward(
...@@ -415,7 +414,7 @@ class LayerNormLinear(TransformerEngineBaseLayer): ...@@ -415,7 +414,7 @@ class LayerNormLinear(TransformerEngineBaseLayer):
"Paddle backend does not support LayerNorm with zero-centered scale.") "Paddle backend does not support LayerNorm with zero-centered scale.")
ln_out = F.layer_norm(x=inp, ln_out = F.layer_norm(x=inp,
normalized_shape=inp.shape[1:], normalized_shape=inp.shape[-1],
weight=self.ln_weight, weight=self.ln_weight,
bias=self.ln_bias, bias=self.ln_bias,
epsilon=self.eps) epsilon=self.eps)
......
...@@ -624,7 +624,7 @@ class LayerNormMLP(TransformerEngineBaseLayer): ...@@ -624,7 +624,7 @@ class LayerNormMLP(TransformerEngineBaseLayer):
"Paddle backend does not support LayerNorm with zero-centered scale.") "Paddle backend does not support LayerNorm with zero-centered scale.")
ln_out = F.layer_norm(x=inp, ln_out = F.layer_norm(x=inp,
normalized_shape=inp.shape[1:], normalized_shape=inp.shape[-1],
weight=self.ln_weight, weight=self.ln_weight,
bias=self.ln_bias, bias=self.ln_bias,
epsilon=self.eps) epsilon=self.eps)
......
# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
"""Fused scaled masked softmax functions"""
import os
import warnings
from typing import Callable, Tuple, Union, Optional
import paddle
from transformer_engine.paddle.cpp_extensions import (
scaled_upper_triang_masked_softmax_forward,
scaled_upper_triang_masked_softmax_backward,
scaled_masked_softmax_forward,
scaled_masked_softmax_backward,
scaled_softmax_forward,
scaled_softmax_backward,
)
# CUDA launch geometry assumed by the fused softmax kernels; used by
# FusedScaleMaskSoftmax.get_batch_per_block to size work per thread block.
THREADS_PER_WARP = 32
THREADS_PER_BLOCK = 128

# Lazy cache of boolean causal masks keyed by sequence length.
# NOTE: grows unbounded if many distinct sequence lengths are seen.
_default_causal_mask = {}
def _get_default_causal_mask(seqlen: int) -> paddle.Tensor:
    """Return (and memoize) the boolean upper-triangular causal mask of shape [seqlen, seqlen]."""
    mask = _default_causal_mask.get(seqlen)
    if mask is None:
        ones = paddle.ones((seqlen, seqlen))
        mask = paddle.triu(ones, diagonal=1).cast('bool')
        _default_causal_mask[seqlen] = mask
    return mask
class ScaledUpperTriangMaskedSoftmax(paddle.autograd.PyLayer):
    """
    Fused kernel performing, in one pass:
    1. scaling of the input tensor,
    2. application of an upper-triangular (causal) mask, as used in GPT-style models,
    3. softmax.
    """

    @staticmethod
    def forward(ctx, inputs: paddle.Tensor, scale: float) -> paddle.Tensor:
        """ScaledUpperTriangMaskedSoftmax fwd"""
        scale_tensor = paddle.Tensor([scale])
        probs = scaled_upper_triang_masked_softmax_forward(inputs, scale_tensor[0])
        # Saved for the backward pass: the softmax output and the scale.
        ctx.save_for_backward(probs, scale_tensor)
        return probs

    @staticmethod
    def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]:
        """ScaledUpperTriangMaskedSoftmax bwd"""
        probs, scale_tensor = ctx.saved_tensor()
        grads = scaled_upper_triang_masked_softmax_backward(output_grads, probs,
                                                            scale_tensor[0])
        # No gradient for the `scale` argument.
        return grads, None
class ScaledMaskedSoftmax(paddle.autograd.PyLayer):
    """
    Fused kernel performing, in one pass:
    1. scaling of the input tensor,
    2. application of the provided mask,
    3. softmax.
    """

    @staticmethod
    def forward(ctx, inputs: paddle.Tensor, mask: paddle.Tensor, scale: float) -> paddle.Tensor:
        """ScaledMaskedSoftmax fwd"""
        scale_tensor = paddle.Tensor([scale])
        probs = scaled_masked_softmax_forward(inputs, mask, scale_tensor[0])
        # Saved for the backward pass: the softmax output and the scale.
        ctx.save_for_backward(probs, scale_tensor)
        return probs

    @staticmethod
    def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]:
        """ScaledMaskedSoftmax bwd"""
        probs, scale_tensor = ctx.saved_tensor()
        grads = scaled_masked_softmax_backward(output_grads, probs, scale_tensor[0])
        # No gradients for the `mask` and `scale` arguments.
        return grads, None, None
class ScaledSoftmax(paddle.autograd.PyLayer):
    """
    Fused kernel performing, in one pass:
    1. scaling of the input tensor,
    2. softmax.
    """

    @staticmethod
    def forward(ctx, inputs: paddle.Tensor, scale: float) -> paddle.Tensor:
        """ScaledSoftmax fwd"""
        scale_tensor = paddle.Tensor([scale])
        probs = scaled_softmax_forward(inputs, scale_tensor[0])
        # Saved for the backward pass: the softmax output and the scale.
        ctx.save_for_backward(probs, scale_tensor)
        return probs

    @staticmethod
    def backward(ctx, output_grads: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None], ...]:
        """ScaledSoftmax bwd"""
        probs, scale_tensor = ctx.saved_tensor()
        grads = scaled_softmax_backward(output_grads, probs, scale_tensor[0])
        # No gradients for the unused mask slot and the `scale` argument.
        return grads, None, None
class FusedScaleMaskSoftmax(paddle.nn.Layer):
    """
    Scaling + masking + softmax, using a fused kernel when available.

    Arguments:
        attn_mask_type: attention mask type ("causal" or padding-style masking).
        mask_func: mask function applied to the scaled scores (paddle backend only).
        softmax_in_fp32: if True, softmax is performed at fp32 precision.
        backend: 'transformer_engine' (fused kernels) or 'paddle' (eager ops).
    """

    def __init__(
        self,
        attn_mask_type: str,
        mask_func: Callable,
        softmax_in_fp32: bool = True,
        backend: str = 'transformer_engine',
    ) -> None:
        super().__init__()
        self.attn_mask_type = attn_mask_type
        # Fusion can be disabled globally through the environment variable.
        self.scaled_masked_softmax_fusion = bool(int(os.getenv("NVTE_MASKED_SOFTMAX_FUSION", "1")))
        self.mask_func = mask_func
        self.softmax_in_fp32 = softmax_in_fp32
        self.backend = backend

    def forward(
        self,
        inp: paddle.Tensor,
        mask: paddle.Tensor,
        scale: Optional[float] = None,
    ) -> paddle.Tensor:
        """FusedScaleMaskSoftmax fprop"""
        # [batch_size, num_heads, s_q, s_kv]
        assert inp.dim() == 4
        self.input_is_fp16 = inp.dtype == paddle.float16
        self.input_is_bf16 = inp.dtype == paddle.bfloat16
        self.input_in_16bit_float = self.input_is_fp16 or self.input_is_bf16

        assert (scale is None or self.softmax_in_fp32), "softmax should be in fp32 when scaled"

        if self.backend == 'transformer_engine' and not self.is_kernel_available(*inp.shape):
            warnings.warn(
                "fused kernel is not available for this input shape, fall back to paddle backend")
            # NOTE: this permanently switches the layer to the paddle backend for
            # all subsequent calls, not only for this input shape.
            self.backend = 'paddle'

        if self.backend == 'transformer_engine':
            return self._te_forward(inp, mask, scale)
        if self.backend == 'paddle':
            return self._pd_forward(inp, mask, scale)
        raise AttributeError(f"Backend {self.backend} is not supported.")

    def is_kernel_available(self, b: int, h: int, s_q: int, s_kv: int) -> bool:
        """Check FusedScaleMaskSoftmax kernel availability based on size"""
        attn_batches = b * h

        if (self.scaled_masked_softmax_fusion    # user wants to fuse
                and self.input_in_16bit_float    # input must be fp16 or bf16
                and 16 < s_kv <= 4096            # s_kv must be in (16, 4096]
                and s_q % 4 == 0                 # s_q must be a multiple of 4
                and attn_batches % 4 == 0        # b * h must be a multiple of 4
           ):
            # The range check above already guarantees 0 <= s_kv <= 4096, so the
            # batch-per-block divisibility constraint is the only remaining gate.
            batch_per_block = self.get_batch_per_block(int(s_kv))

            if self.attn_mask_type == "causal":
                if attn_batches % batch_per_block == 0:
                    return True
            else:
                if s_q % batch_per_block == 0:
                    return True
        return False

    def _te_forward(self,
                    inp: paddle.Tensor,
                    mask: paddle.Tensor,
                    scale: Optional[float] = None) -> paddle.Tensor:
        """Fused masked softmax kernel"""
        # BUGFIX: paddle.Tensor has no callable size(); unpack dims from `shape`,
        # consistent with the `is_kernel_available(*inp.shape)` call in forward().
        b, h, s_q, s_kv = inp.shape
        scale = 1.0 if scale is None else scale

        if self.attn_mask_type == "causal":
            assert s_q == s_kv, "causal mask is only for self attention"
            # The causal kernel expects a 3D tensor (attn_batches, s_q, s_kv).
            inp = inp.reshape((-1, s_q, s_kv))
            probs = ScaledUpperTriangMaskedSoftmax.apply(inp, scale)
            return probs.reshape((b, h, s_q, s_kv))
        # input is 4D tensor (b, h, s_q, s_kv)
        if mask is not None:
            return ScaledMaskedSoftmax.apply(inp, mask, scale)
        return ScaledSoftmax.apply(inp, scale)

    def _pd_forward(self,
                    inp: paddle.Tensor,
                    mask: paddle.Tensor,
                    scale: Optional[float] = None) -> paddle.Tensor:
        """Call Paddle OP"""
        if self.input_in_16bit_float and self.softmax_in_fp32:
            # Compute the softmax at fp32 for numerical stability.
            inp = paddle.cast(inp, 'float32')

        if scale is not None:
            inp = inp * scale

        if self.attn_mask_type == "causal":
            # The supplied mask is ignored for causal attention; a cached
            # upper-triangular mask of shape [s_q, s_q] is used instead.
            mask = _get_default_causal_mask(inp.shape[2])

        mask_output = self.mask_func(inp, mask) if mask is not None else inp
        probs = paddle.nn.functional.softmax(mask_output, axis=-1)

        if self.input_in_16bit_float and self.softmax_in_fp32:
            # Cast back to the original 16-bit input dtype.
            if self.input_is_fp16:
                probs = paddle.cast(probs, 'float16')
            else:
                probs = paddle.cast(probs, 'bfloat16')

        return probs

    @staticmethod
    def get_batch_per_block(key_seq_len: int) -> int:
        """Softmax utility: batches processed per CUDA block for a given s_kv."""
        # Round s_kv up to the next power of two, then derive the launch geometry.
        pow2 = 1 << (key_seq_len - 1).bit_length()
        warp_size = pow2 if pow2 < THREADS_PER_WARP else THREADS_PER_WARP
        batches_per_warp = 2 if pow2 <= 128 else 1
        warps_per_block = THREADS_PER_BLOCK // warp_size
        batches_per_block = warps_per_block * batches_per_warp
        return batches_per_block
# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
"""Transformer"""
from typing import Optional, Union
import paddle
from transformer_engine.paddle.constants import (
AttnMaskTypes,
LayerTypes,
)
from transformer_engine.paddle.layer import (LayerNormMLP, LayerNorm, MultiHeadAttention)
from .base import TransformerEngineBaseLayer
class TransformerLayer(TransformerEngineBaseLayer):
    r"""
    TransformerLayer is made up of an attention block and a feedforward network (MLP).
    This standard layer is based on the paper "Attention Is All You Need".

    Parameters
    ----------
    hidden_size : int
        size of each input sample.
    ffn_hidden_size : int
        intermediate size to which input samples are projected.
    num_attention_heads : int
        number of attention heads in the transformer layer.
    layernorm_epsilon : float, default = 1e-5
        a value added to the denominator of layer normalization
        for numerical stability.
    hidden_dropout: float, default = 0.1
        dropout probability for the dropout op after FC2 layer.
    attention_dropout: float, default = 0.1
        dropout probability for the dropout op during multi-head attention.
    weight_attr : Union[paddle.ParamAttr, None], default = `None`
        parameter attribute forwarded to the attention and MLP sub-layers.
    bias_attr : Union[paddle.ParamAttr, None, bool], default = `None`
        bias attribute forwarded to the attention and MLP sub-layers.
    self_attn_mask_type: {'causal', 'padding'}, default = `causal`
        type of attention mask passed into softmax operation.
    params_dtype : paddle.dtype, default = `paddle.get_default_dtype()`
        it controls the type used to allocate the initial parameters. Useful when
        the model is trained with lower precision and the original FP32 parameters
        would not fit in GPU memory.
    apply_residual_connection_post_layernorm : bool, default = `False`
        if set to `True`, residual connections are taken
        from the output of layer norm (default is taken
        from input of layer norm)
    output_layernorm: bool, default = `False`
        if set to `True`, layer normalization is applied on the output side,
        after the final dropout-add. default behavior is to apply layer
        normalization on the input side, before the QKV transformation.
    layer_type: {'encoder', 'decoder'}, default = `encoder`
        if set to `decoder`, an additional cross-attn block is added after self-attn.
        This can be used for structures like `T5` Transformer in conjunction with the
        `encoder` option.
    zero_centered_gamma : bool, default = 'False'
        if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
        the LayerNorm formula changes to

        .. math::
            y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
            (1 + \gamma) + \beta
    activation : str, default = 'gelu'
        Type of activation used in MLP block.
        Options are: 'gelu', 'relu', 'reglu', 'geglu' and 'swiglu'.
    backend : str, default = 'transformer_engine'
        backend forwarded to the attention and MLP sub-layers.
    """

    def __init__(self,
                 hidden_size: int,
                 ffn_hidden_size: int,
                 num_attention_heads: int,
                 layernorm_epsilon: float = 1e-5,
                 hidden_dropout: float = 0.1,
                 attention_dropout: float = 0.1,
                 weight_attr: Union[paddle.ParamAttr, None] = None,
                 bias_attr: Union[paddle.ParamAttr, None, bool] = None,
                 self_attn_mask_type: str = "causal",
                 params_dtype: Optional[paddle.dtype] = None,
                 apply_residual_connection_post_layernorm: bool = False,
                 output_layernorm: bool = False,
                 layer_type: str = "encoder",
                 zero_centered_gamma: bool = False,
                 activation: str = 'gelu',
                 backend: str = 'transformer_engine') -> None:
        super().__init__()

        params_dtype = paddle.get_default_dtype() if params_dtype is None else params_dtype
        self.output_layernorm = output_layernorm
        self.layer_type = layer_type
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.self_attn_mask_type = self_attn_mask_type
        # Validate configuration against the closed sets declared in constants.py.
        assert (self_attn_mask_type
                in AttnMaskTypes), f"self_attn_mask_type {self_attn_mask_type} not supported"
        assert layer_type in LayerTypes, f"layer_type {layer_type} not supported"

        # Positional/keyword arguments shared by the self- and cross-attention blocks.
        attention_args = (
            hidden_size,
            num_attention_heads,
            attention_dropout,
            layernorm_epsilon,
            weight_attr,
            bias_attr,
        )
        common_attention_kwargs = {
            "params_dtype": params_dtype,
            "return_layernorm_output": apply_residual_connection_post_layernorm,
            "zero_centered_gamma": zero_centered_gamma,
            "backend": backend,
        }

        # When output_layernorm is set, normalization happens after the final
        # dropout-add instead, so the attention block skips its input LayerNorm.
        self.self_attention = MultiHeadAttention(
            *attention_args,
            **common_attention_kwargs,
            attn_mask_type=self_attn_mask_type,
            input_layernorm=not output_layernorm,
            attention_type="self",
        )

        # Decoder layers add a cross-attention block (T5-style) after self-attention.
        if layer_type == "decoder":
            self.inter_attention = MultiHeadAttention(
                *attention_args,
                **common_attention_kwargs,
                attn_mask_type="padding",
                input_layernorm=True,
                attention_type="cross",
            )

        self.layernorm_mlp = LayerNormMLP(
            hidden_size,
            ffn_hidden_size,
            eps=layernorm_epsilon,
            weight_attr=weight_attr,
            bias_attr=bias_attr,
            activation=activation,
            return_layernorm_output=apply_residual_connection_post_layernorm,
            zero_centered_gamma=zero_centered_gamma,
            backend=backend,
        )

        self.hidden_dropout = hidden_dropout

        # Final LayerNorm for BERT-like architectures (applied after dropout-add).
        if self.output_layernorm:
            self.layernorm = LayerNorm(
                hidden_size,
                layernorm_epsilon,
                weight_attr,
                bias_attr,
                zero_centered_gamma=zero_centered_gamma,
                backend=backend,
            )

    def forward(
        self,
        hidden_states: paddle.Tensor,
        attention_mask: Optional[paddle.Tensor] = None,
        encoder_output: Optional[paddle.Tensor] = None,
        enc_dec_attn_mask: Optional[paddle.Tensor] = None,
        core_attention_bias_type: str = "no_bias",
        core_attention_bias: Optional[paddle.Tensor] = None,
        set_zero: bool = True,
    ) -> paddle.Tensor:
        """
        Transformer Layer: attention block and a feedforward network (MLP)

        .. note::
            Argument :attr:`attention_mask` will be ignored when :attr:`self_attn_mask_type`
            is set to `"causal"`.

        Parameters
        ----------
        hidden_states : paddle.Tensor
            Input tensor.
        attention_mask : Optional[paddle.Tensor], default = `None`
            Boolean tensor used to mask out self-attention softmax input.
        encoder_output : Optional[paddle.Tensor], default = `None`
            Output of the encoder block to be fed into the decoder block if using
            `layer_type="decoder"`.
        enc_dec_attn_mask : Optional[paddle.Tensor], default = `None`
            Boolean tensor used to mask out inter-attention softmax input if using
            `layer_type="decoder"`.
        core_attention_bias_type: str, default = `no_bias`
        core_attention_bias: Optional[paddle.Tensor], default = `None`
            Bias tensor for Q * K.T
        set_zero: bool, default = `True`
            Whether to set output tensors to 0 or not before use.
        """

        if self.self_attn_mask_type != "causal" and attention_mask is not None:
            assert (attention_mask.dtype == paddle.bool), "Attention mask must be a boolean tensor"

        assert core_attention_bias_type in ['no_bias'], f"Only no_bias is supported currently, " \
            f"but receive core_attention_bias_type = {core_attention_bias_type}"

        # Self attention.
        self_attention_outputs = self.self_attention(
            hidden_states,
            attention_mask,
            core_attention_bias_type=core_attention_bias_type,
            core_attention_bias=core_attention_bias,
            set_zero=set_zero,
        )

        # The attention block returns (output, ln_out) when residuals are taken
        # post-layernorm; otherwise the residual is the block's own input.
        if self.apply_residual_connection_post_layernorm and not self.output_layernorm:
            attention_output, residual = self_attention_outputs
        else:
            attention_output = self_attention_outputs
            residual = hidden_states

        # Dropout-add.
        # NOTE(review): dropout is applied with training=True unconditionally, so it
        # remains active even in eval mode — confirm this is intended.
        out = paddle.nn.functional.dropout(
            attention_output,
            p=self.hidden_dropout,
            training=True,
        )
        bda_output = residual + out

        # Cross attention.
        if self.layer_type == "decoder":
            inter_attention_outputs = self.inter_attention(
                bda_output,
                enc_dec_attn_mask,
                encoder_output=encoder_output,
                core_attention_bias_type=core_attention_bias_type,
                core_attention_bias=core_attention_bias,
                set_zero=set_zero,
            )
            if self.apply_residual_connection_post_layernorm:
                attention_output, residual = inter_attention_outputs
            else:
                attention_output = inter_attention_outputs
                residual = bda_output

            # Dropout-add.
            out = paddle.nn.functional.dropout(
                attention_output,
                p=self.hidden_dropout,
                training=True,
            )
            bda_output = residual + out

        # MLP.
        mlp_outputs = self.layernorm_mlp(bda_output)
        if self.apply_residual_connection_post_layernorm:
            mlp_output, residual = mlp_outputs
        else:
            mlp_output = mlp_outputs
            residual = bda_output

        # Dropout-add.
        out = paddle.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=True)
        output = residual + out

        # For BERT like architectures.
        if self.output_layernorm:
            output = self.layernorm(output)

        # output: [b, s, hidden]
        return output
...@@ -52,3 +52,37 @@ def get_paddle_act_func(activation): ...@@ -52,3 +52,37 @@ def get_paddle_act_func(activation):
if activation not in funcs: if activation not in funcs:
raise "Activation type " + activation + " is not supported." raise "Activation type " + activation + " is not supported."
return funcs[activation] return funcs[activation]
def attention_mask_func(attention_scores: paddle.Tensor,
attention_mask: paddle.Tensor) -> paddle.Tensor:
"""Get attention mask"""
def _masked_fill(x, mask, value):
y = paddle.full(x.shape, value, x.dtype)
return paddle.where(mask, y, x)
attention_scores = _masked_fill(attention_scores, attention_mask, -10000.0)
return attention_scores
def mask_to_cu_seqlens(mask: paddle.Tensor, need_kv: bool = False) -> paddle.Tensor:
"""Convert mask to cu_seqlens"""
assert 'bool' in str(mask.dtype), "mask must be bool dtype"
assert len(mask.shape) == 4 and mask.shape[1] == 1, "mask must be [b, 1, s_q, s_kv]"
q_actual_seqlens = paddle.sum(mask[:, :, :, 0] == False, axis=(-1, -2), dtype='int32') # pylint: disable=singleton-comparison
q_cu_seqlens = paddle.cumsum(q_actual_seqlens)
q_cu_seqlens = paddle.concat([paddle.zeros([1], dtype=paddle.int32), q_cu_seqlens], axis=0)
if not need_kv:
return q_cu_seqlens, None
kv_actual_seqlens = paddle.sum(mask[:, :, 0, :] == False, axis=(-1, -2), dtype='int32') # pylint: disable=singleton-comparison
kv_cu_seqlens = paddle.cumsum(kv_actual_seqlens)
kv_cu_seqlens = paddle.concat([paddle.zeros([1], dtype=paddle.int32), kv_cu_seqlens], axis=0)
return q_cu_seqlens, kv_cu_seqlens
def divide(numerator: int, denominator: int) -> int:
"""Ensure that numerator is divisible by the denominator and return
the division value."""
assert (numerator % denominator == 0), f"{numerator} is not divisible by {denominator}"
return numerator // denominator
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment