Commit 5988d2cc authored by yuguo960516
bert-large
parent 478602ba
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from typing import Optional
import oneflow as flow
from oneflow import nn
class Activation(str, Enum):
SquaredReLU = "squared_relu"
GeLU = "gelu"
GeLUTanh = "gelu_tanh"
LeakyReLU = "leaky_relu"
ReLU = "relu"
Tanh = "tanh"
QuickGELU = "quick_gelu"
# For unit testing / parity comparisons, probably not the fastest way
class SquaredReLU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
x_ = flow._C.relu(x)
return x_ * x_
class Passthrough(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x
class GeLUTanh(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
"""When the approximate argument is 'tanh', Gelu is estimated with:
0.5 * x * (1.0 + flow.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * flow.pow(x, 3.0))))
"""
return flow.nn.functional.gelu(x, approximate="tanh")
class QuickGELU(nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x: flow.Tensor) -> flow.Tensor:
return x * flow.sigmoid(1.702 * x)
def build_activation(activation: Optional[Activation]):
"""
Fetch an activation layer by name, e.g.,
``build_activation("gelu")`` returns an ``nn.GELU()`` module.
"""
if not activation:
return Passthrough()
return {
Activation.ReLU: nn.ReLU,
Activation.GeLU: nn.GELU,
Activation.GeLUTanh: GeLUTanh,
Activation.LeakyReLU: nn.LeakyReLU,
Activation.SquaredReLU: SquaredReLU,
Activation.Tanh: nn.Tanh,
Activation.QuickGELU: QuickGELU,
}[activation]()
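# Minimal usage sketch for the registry above, assuming this module is importable as
# ``libai.layers`` (the MLP module below imports ``build_activation`` from there).
# The tensor sizes are illustrative only.
import oneflow as flow

from libai.layers import build_activation

act = build_activation("gelu")          # Activation is a str Enum, so the raw string
quick = build_activation("quick_gelu")  # value selects the matching entry
x = flow.randn(2, 4)
y = act(x)                              # nn.GELU(), elementwise, same shape as x
z = quick(x)                            # x * sigmoid(1.702 * x)
assert y.shape == x.shape and z.shape == x.shape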
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import enum
import math
from typing import Tuple
import oneflow as flow
from oneflow import nn
from .linear import Linear
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
class MultiheadAttention(nn.Module):
"""Multi-head attention layer, support self attention and cross attention.
Args:
hidden_size: size of hidden state.
num_attention_heads: number of attention heads.
is_cross_attention: used to specify whether it is self attention or cross attention.
Defaults to False.
attention_dropout_prob: dropout probability of attention weights.
Defaults to 0.0.
output_dropout_prob: dropout probability of output. Defaults to 0.0.
init_method: method to initialize the input layer weights.
Defaults to ``init.xavier_normal_``.
output_layer_init_method: method to initialize the output layer weights.
If None, use ``init_method``.
bias_dropout_fusion: whether to fuse add bias and dropout.
Defaults to False.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax.
Defaults to False.
apply_query_key_layer_scaling: if ``True``, scale the attention scores by the layer index.
Defaults to False.
layer_idx: the layer index, which determines the placement.
It is used in pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
num_attention_heads,
is_cross_attention=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
if output_layer_init_method is None:
output_layer_init_method = init_method
assert (
hidden_size % num_attention_heads == 0
), "hidden_size must be divisible by num_attention_heads."
self.num_heads = num_attention_heads
self.head_size = hidden_size // num_attention_heads
self.attn_mask_type = attn_mask_type
self.attention_dropout_prob = attention_dropout_prob
self.dropout = nn.Dropout(p=attention_dropout_prob)
self.norm_factor = 1.0 / math.sqrt(float(self.head_size))
self.coeff = None
if apply_query_key_layer_scaling:
self.coeff = layer_idx + 1
self.norm_factor /= self.coeff
self.is_cross_attention = is_cross_attention
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if self.bias_dropout_fusion:
self.output_dropout_prob = output_dropout_prob
else:
self.output_dropout = nn.Dropout(p=output_dropout_prob)
if self.is_cross_attention:
self.query = Linear(
self.hidden_size,
self.hidden_size,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.key_value = Linear(
self.hidden_size,
self.hidden_size * 2,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
else:
self.query_key_value = Linear(
self.hidden_size,
self.hidden_size * 3,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.dense = Linear(
self.hidden_size,
self.hidden_size,
parallel="row",
init_method=output_layer_init_method,
skip_bias_add=self.bias_dropout_fusion,
layer_idx=layer_idx,
)
def forward(
self,
hidden_states: flow.Tensor,
encoder_states: flow.Tensor = None,
attention_mask: flow.Tensor = None,
past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
use_cache: bool = False,
):
"""
Args:
hidden_states (flow.Tensor): shape is [bsz, tgt_len, hidden_size].
encoder_states (flow.Tensor, optional): shape is [bsz, src_len, hidden_size].
Defaults to None.
attention_mask (flow.Tensor, optional): shape is [bsz, 1, tgt_len, src_len].
It should be the combination of a padding mask and a causal mask.
For self-attention in the encoder, it is the padding mask of the source input.
For self-attention in the decoder, it is the combination of the padding mask of the
target input and a causal mask. For cross-attention in the decoder, it is the
padding mask of the source input.
Defaults to None.
past_key_value (Tuple[flow.Tensor, flow.Tensor], optional): tuple of key and value,
each shape is [bsz, num_heads, src_len, head_size]. Defaults to None.
use_cache (bool, optional): set to True when the model is in the inference
phase and used for incremental decoding. Defaults to False.
"""
# hidden_states, encoder_states: [S(0), B]
# attention_mask: [S(0), B]
if encoder_states is not None:
encoder_states = encoder_states.to_global(placement=hidden_states.placement)
if attention_mask is not None:
attention_mask = attention_mask.to_global(placement=hidden_states.placement)
bsz, tgt_len = hidden_states.size()[:2]
if self.is_cross_attention:
# if it is cross attention, key and value should be calculated only once, and the
# result can be reused.
query = self.query(hidden_states)
query = query.view(bsz, -1, self.num_heads, self.head_size)
query = query.permute(0, 2, 1, 3)
if past_key_value is not None:
key, value = past_key_value
elif encoder_states is not None:
key_value = self.key_value(encoder_states)
key_value = key_value.view(bsz, -1, self.num_heads, 2 * self.head_size)
key_value = key_value.permute(0, 2, 1, 3)
key, value = flow.chunk(key_value, chunks=2, dim=-1)
else:
raise ValueError(
"past_key_value and encoder_states cannot be None at the same time."
)
else:
# if it is self attention, query, key, and value are all obtained from hidden_states.
# when in the inference phase of an incremental decoder,
# hidden_states is the last-added state,
# the full key and value could be obtained by concatenating with past_key_value.
query_key_value = self.query_key_value(hidden_states)
query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
query_key_value = query_key_value.permute(
0, 2, 1, 3
) # [bsz, num_heads, src_len, 3 * head_size]
query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
if past_key_value is not None:
past_key, past_value = past_key_value
key = flow.cat((past_key.type_as(key), key), dim=2)
value = flow.cat((past_value.type_as(value), value), dim=2)
# query, key, value: [S(0), S(1)], shape: [bsz, num_heads, seq_length, head_size]
if use_cache:
past_key_value = (key, value)
# [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)]
attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor)
# [S(0), S(1)] x [S(0), B] = [S(0), S(1)]
if attention_mask is not None:
if self.scale_mask_softmax_fusion:
if self.attn_mask_type == AttnMaskType.padding:
attention_mask = (
attention_mask.expand_as(attention_scores) if use_cache else attention_mask
)
attention_weights = flow._C.fused_scale_mask_softmax_dropout(
attention_scores,
attention_mask,
fill_value=-10000.0,
scale=self.coeff,
p=self.attention_dropout_prob,
)[0]
else:
if self.coeff is not None:
attention_scores *= self.coeff
attention_scores = flow.mul(attention_scores, attention_mask)
attention_scores = attention_scores - 10000.0 * (1 - attention_mask)
# TODO(xingyu.liao): graph will occur `where_scalar` errors
# when using `masked_fill`
# attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0)
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
else:
if self.scale_mask_softmax_fusion and self.attn_mask_type == AttnMaskType.causal:
attention_weights = flow._C.fused_scale_tril_softmax_mask_scale(
attention_scores,
p=self.attention_dropout_prob,
diagonal=0,
tril_scale_value=self.coeff,
tril_fill_value=-10000.0,
)[0]
else:
attention_weights = flow.softmax(attention_scores, dim=-1)
# [bsz, num_heads, tgt_len, src_len]
attention_weights = self.dropout(attention_weights)
# Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)]
context = flow.matmul(attention_weights, value)
# Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size]
context = context.transpose(1, 2)
# Concat multi-head results from
# [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size]
# SBP sign: [S(0), S(2)]
# [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B]
output = self.dense(context.flatten(2))
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.output_dropout(output)
if use_cache:
output = (output, past_key_value)
return output
def extra_repr(self) -> str:
return "hidden_size={}, num_heads={}, is_cross_attention={}".format(
self.hidden_size,
self.num_heads,
self.is_cross_attention,
)
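# A local sketch of the score/softmax/context math in MultiheadAttention.forward above,
# written with plain (non-global) tensors so it runs without any parallel placement.
# bsz / num_heads / tgt_len / head_size are illustrative values.
import math
import oneflow as flow

bsz, num_heads, tgt_len, head_size = 2, 4, 8, 16
query = flow.randn(bsz, num_heads, tgt_len, head_size)
key = flow.randn(bsz, num_heads, tgt_len, head_size)
value = flow.randn(bsz, num_heads, tgt_len, head_size)

norm_factor = 1.0 / math.sqrt(head_size)
# [bsz, num_heads, tgt_len, tgt_len], same op as in the module above
scores = flow.matmul(query, key, transpose_b=True, alpha=norm_factor)
weights = flow.softmax(scores, dim=-1)
context = flow.matmul(weights, value)          # [bsz, num_heads, tgt_len, head_size]
output = context.transpose(1, 2).flatten(2)    # [bsz, tgt_len, num_heads * head_size]
assert tuple(output.shape) == (bsz, tgt_len, num_heads * head_size)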
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
class ParallelCrossEntropyLoss(nn.Module):
"""This criterion acts like :class:`~flow.nn.CrossEntropyLoss` except it will
execute the cross entropy loss computation distributed across different GPUs.
"""
def forward(self, logits: flow.Tensor, target: flow.Tensor):
"""Function for the distributed cross entropy.
Args:
logits (flow.Tensor): vocab_parallel_logits with shape
(batch_size, seq_length, vocab_size) and sbp signature is [S(0), S(2)].
target (flow.Tensor): target with shape (batch_size, seq_length) and
sbp signature is [S(0), B].
"""
assert logits.ndim == 3
assert target.ndim == 2
assert logits.shape[0:2] == target.shape
target = target.to_global(placement=logits.placement)
# Change -1 in target to 0 because sparse_softmax_cross_entropy doesn't accept -1
target = target * (target >= 0)
lm_loss = flow._C.sparse_softmax_cross_entropy(
logits.view(-1, logits.shape[-1]),
target.view(-1),
)
return lm_loss
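# A small local illustration of the ``target * (target >= 0)`` remapping above: positions
# marked with -1 (e.g. padding) are mapped to class 0 so the kernel accepts them; their
# loss contribution is typically masked out afterwards by the caller.
import oneflow as flow

target = flow.tensor([[5, 2, -1, -1], [7, -1, -1, -1]])
remapped = target * (target >= 0)   # -> [[5, 2, 0, 0], [7, 0, 0, 0]]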
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
def drop_path(x, drop_prob: float = 0.5, training: bool = False, scale_by_keep: bool = True):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
if drop_prob == 0.0 or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
# similar operation to new_tensor(shape).bernoulli_(keep_prob)
random_tensor = flow.rand(*shape, dtype=x.dtype, sbp=x.sbp, placement=x.placement)
random_tensor = (random_tensor < keep_prob).to(flow.float32)
if keep_prob > 0.0 and scale_by_keep:
random_tensor = random_tensor / keep_prob
return x * random_tensor
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
self.scale_by_keep = scale_by_keep
def forward(self, x):
return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
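# A quick local check of the stochastic-depth behaviour above, rewritten for plain local
# tensors (no sbp/placement arguments to ``flow.rand``). ``drop_path_local`` is only an
# illustrative helper, not part of this module.
import oneflow as flow

def drop_path_local(x, drop_prob=0.2, training=True, scale_by_keep=True):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)       # one mask value per sample
    mask = (flow.rand(*shape) < keep_prob).to(x.dtype)
    if scale_by_keep:
        mask = mask / keep_prob                        # keeps the expected value unchanged
    return x * mask

x = flow.ones(4, 3, 8)
y = drop_path_local(x)   # each sample is either zeroed or rescaled by 1 / keep_prob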
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.utils import distributed as dist
class Embedding(nn.Module):
"""Construct the trainable embedding module, which does not support parallelization.
This can be used for positional embedding and token type embedding.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
layer_idx=0,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
assert num_embeddings > 0
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.init_method(self.weight)
# FIXME(lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# embeddings with sbp sign: [B, B]
# [B, B] x [S(0), B] --> [S(0), B]
# ↑ ↑ ↑
# embed pos_ids pos_embed
input_embeds = flow._C.gather(weight, input_ids, axis=0)
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class VocabEmbedding(nn.Module):
"""Construct the word embeddings, which may be split along vocabulary dimension.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
# Word token embedding shape with (vocab_size, hidden_size)
# sbp: [B, S(0)]
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
# Initialize the word embedding
self.init_method(self.weight)
# FIXME(Lxy): Fill padding_idx is not supported in nd_sbp right now.
# self._fill_padding_idx_with_zero()
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
# input_ids with shape (batch_size, seq_len), and sbp sign: [S(0), B]
# Gather forward sbp sign
# [B, S(0)] x [S(0), B] --> [S(0), P]
# ↑ ↑ ↑
# embed input_ids input_embeds
input_embeds = flow._C.gather(weight, input_ids, axis=0)
# Set the embeds sbp from [S(0), P] --> [S(0), B] to get complete embedding results.
input_embeds = input_embeds.to_global(sbp=dist.get_hidden_sbp())
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
class SinePositionalEmbedding(nn.Module):
"""Construct the sinusoidal positional embeddings.
Arguments:
num_embeddings: maximum number of positions to embed.
embedding_dim: dimension of embeddings.
"""
def __init__(self, num_embeddings, embedding_dim):
super().__init__()
self.embedding_dim = embedding_dim
self.num_embeddings = num_embeddings
position_embedding = flow.zeros(
num_embeddings,
embedding_dim,
dtype=flow.float32,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
position = flow._C.global_arange(
start=0,
end=num_embeddings,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
).unsqueeze(1)
position_range = flow._C.global_arange(
start=0,
end=embedding_dim,
step=2,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
dtype=flow.float32,
)
div_term = flow.exp(position_range * (-math.log(10000.0) / embedding_dim))
position_embedding[:, 0::2] = flow.sin(position * div_term)
position_embedding[:, 1::2] = flow.cos(position * div_term)
self.register_buffer("position_embedding", position_embedding)
def forward(self, position_ids):
position_embeds = flow._C.gather(self.position_embedding, position_ids, axis=0)
return position_embeds
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
return s.format(**self.__dict__)
class PatchEmbedding(nn.Module):
"""2D Image to Patch Embedding
Arguments:
img_size: size of the input image. Defaults to 224.
patch_size: size of each embedded patch. Defaults to 16.
in_chans: number of input channels. Defaults to 3.
embed_dim: dimension of the embedded patch. Defaults to 768.
norm_layer: normalization layer applied to the patch embedding, or None to skip it.
Defaults to None.
flatten: whether to flatten the patch embedding or keep the 2-D shape. Defaults to True.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
norm_layer=None,
flatten=True,
*,
layer_idx=0,
):
super().__init__()
img_size = img_size if isinstance(img_size, tuple) else (img_size, img_size)
patch_size = patch_size if isinstance(patch_size, tuple) else (patch_size, patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
).to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
)
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
def forward(self, x):
B, C, H, W = x.shape
assert (
H == self.img_size[0]
), f"Input image height ({H}) doesn't match model ({self.img_size[0]})."
assert (
W == self.img_size[1]
), f"Input image width ({W}) doesn't match model ({self.img_size[1]})."
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
x = self.norm(x)
return x
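# A local re-derivation of the sinusoidal table built in SinePositionalEmbedding above,
# using plain tensors instead of global ones (no placement/sbp). The sizes are
# illustrative; the formula matches the module's sin/cos construction.
import math
import oneflow as flow

num_embeddings, embedding_dim = 16, 8
pe = flow.zeros(num_embeddings, embedding_dim)
position = flow.arange(0, num_embeddings, dtype=flow.float32).unsqueeze(1)
div_term = flow.exp(
    flow.arange(0, embedding_dim, 2, dtype=flow.float32) * (-math.log(10000.0) / embedding_dim)
)
pe[:, 0::2] = flow.sin(position * div_term)   # even dimensions
pe[:, 1::2] = flow.cos(position * div_term)   # odd dimensions
# Position ids can then be gathered row-wise, as in SinePositionalEmbedding.forward.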
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LayerNorm(nn.Module):
"""Applies Layer Normalization over a mini-batch of inputs in 1D parallelism.
Args:
normalized_shape: input shape from an expected input of size.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self, normalized_shape, eps=1e-5, elementwise_affine=True, bias=True, *, layer_idx=0
):
super().__init__()
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape,)
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.layer_idx = layer_idx
if elementwise_affine:
self.weight = nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.bias = nn.Parameter(
flow.zeros(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
requires_grad=bias,
)
else:
self.weight = None
self.bias = None
def forward(self, x):
assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
begin_norm_axis = x.ndim - len(self.normalized_shape)
begin_params_axis = x.ndim - len(self.normalized_shape)
if self.elementwise_affine:
y = flow._C.layer_norm_affine(
x,
self.weight,
self.bias,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
else:
y = flow._C.layer_norm(
x,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
return y
def extra_repr(self) -> str:
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
**self.__dict__
)
class RMSLayerNorm(nn.Module):
"""T5 uses a layer_norm which only scales and doesn't shift, which is also known as
Root Mean Square Layer Normalization thus varience is calculated w/o mean and
there is no bias. More details see: https://arxiv.org/abs/1910.07467.
Args:
normalized_shape: input shape from an expected input of size.
eps: a value added to the denominator for numerical stability. Defaults to 1e-6.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(self, normalized_shape, eps=1e-6, layer_idx=0):
super().__init__()
self.layer_idx = layer_idx
self.weight = flow.nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.l2norm_epsilon = eps
def forward(self, hidden_states):
return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon)
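# A reference (unfused) sketch of the normalization RMSLayerNorm delegates to
# ``flow._C.rms_layer_norm`` above, following the docstring: divide by the root mean
# square of the last dimension, with no mean subtraction and no bias. Written with
# plain local tensors; ``rms_norm_reference`` is an illustrative helper.
import oneflow as flow

def rms_norm_reference(x, weight, eps=1e-6):
    mean_square = x.pow(2).mean(dim=-1, keepdim=True)   # uncentered second moment
    return weight * x / flow.sqrt(mean_square + eps)

hidden = flow.randn(2, 5, 16)
weight = flow.ones(16)
out = rms_norm_reference(hidden, weight)   # same shape as hidden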
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Linear1D(nn.Module):
r"""Linear layer with 1D parallelism which includes column parallelism and row parallelism.
The linear layer is defined as :math:`y = xA^T + b`.
In column parallelism, A^T is parallelized along the second dimension
as :math:`A^T = [A_1, ..., A_p]`.
In row parallelism, A^T is parallelized along the first dimension and X along its second
dimension as:
.. math::
A^T = \begin{bmatrix}
A\_1 \\
. \\
. \\
. \\
A\_p
\end{bmatrix}
x = \begin{bmatrix}
x\_1 & ... & x\_p
\end{bmatrix}
Arguments:
in_features: size of each input sample.
out_features: size of each output sample.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
parallel: Parallel mode. Defaults to "data".
init_method: method to initialize weight. Defaults to :func:`nn.init.xavier_normal_`.
skip_bias_add: skip adding bias but instead return it, so that adding bias can be fused with
other elementwise operations. Defaults to ``False``.
layer_idx: the layer index, which determines the placement. It is used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
*,
layer_idx=0, # enforce layer_idx passed with keyword
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
# Column parallel
# weight sbp sign: [B, S(0)], weight will be transposed when performing matmul,
# so the effective weight sbp sign is [B, S(1)]
# bias sbp sign: [B, S(0)]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "row":
# Row parallel
# weight sbp sign: [B, S(1)], weight will be transposed when performing matmul,
# so the effective weight sbp sign is [B, S(0)]
# bias sbp sign: [B, B]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
raise KeyError(f"{parallel} is not supported! Only 'data', 'row' and 'col' are supported.")
self.weight = flow.nn.Parameter(
flow.empty(
(out_features, in_features),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])):
# If the last dim of weight sbp sign is S(0), then last dim of weight.t sbp
# sign is S(1), so the last dim of x sbp sign must be B.
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight, transpose_b=True)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
):
# If the last dim of weight sbp sign is S(1), then last dim of weight.t sbp
# sign is S(0), so the last dim of x sbp sign must be S(ndim-1).
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight, transpose_b=True)
# Change x.sbp for followup forward pass.
# This line can be removed when sbp can be auto inferred.
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
# NOTE(chengcheng): when input x is [S(0), B], there is no need to change sbp for x.
# x = x.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(0)]))
x = flow.matmul(x, self.weight, transpose_b=True)
else:
# Not supported weight_sbp, deduce sbp and communicate with nccl automatically.
x = flow.matmul(x, self.weight, transpose_b=True)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
# Alias for Linear1D
Linear = Linear1D
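# A toy, single-process illustration of the column/row decomposition documented above:
# splitting the weight by output features ("col" parallel) concatenates partial results,
# while splitting by input features ("row" parallel) produces partial sums that must be
# reduced. All tensors here are plain local tensors; the sizes are illustrative.
import oneflow as flow

x = flow.randn(4, 6)     # [batch, in_features]
w = flow.randn(8, 6)     # [out_features, in_features], the layout stored by Linear1D

full = flow.matmul(x, w, transpose_b=True)                 # [4, 8]

# "col" parallel: split out_features across two ranks, then concatenate.
w0, w1 = flow.chunk(w, chunks=2, dim=0)
col = flow.cat(
    [flow.matmul(x, w0, transpose_b=True), flow.matmul(x, w1, transpose_b=True)], dim=1
)

# "row" parallel: split in_features (and x accordingly), then sum the partial outputs.
wa, wb = flow.chunk(w, chunks=2, dim=1)
xa, xb = flow.chunk(x, chunks=2, dim=1)
row = flow.matmul(xa, wa, transpose_b=True) + flow.matmul(xb, wb, transpose_b=True)

assert (col - full).abs().max() < 1e-4
assert (row - full).abs().max() < 1e-4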
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LMLogits(nn.Module):
def __init__(self, vocab_size, bias=False):
super().__init__()
self.bias = (
nn.Parameter(
flow.zeros(
(vocab_size,),
dtype=flow.float32,
placement=dist.get_layer_placement(-1),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
if bias
else None
)
def forward(self, input, word_embeddings):
"""LM logits using word embedding weights"""
# input with sbp sign [S(0), B] and word_embeddings with sbp sign [S(0), B]
# NOTE(l1aoxingyu): This is for pipeline parallelism
# change word embedding placement from stage(0) to stage(-1)
w = word_embeddings.to_global(placement=input.placement)
# NOTE(l1aoxingyu): input x embed^T = logits with sbp sign
# [S(0), B] x [B, S(1)] --> [S(0), S(1)]
# ↑ ↑ ↑
# input embed^T logits
# Backward pass input.grad = logits.grad x embed with sbp sign
# [S(0), S(1)] x [B, S(0)] --> [S(0), P]
# ↑ ↑ ↑
# logits.grad embed input.grad
# When using input.grad as the head node for the backward pass, its sbp sign needs
# to be converted from [S(0), P] --> [S(0), B]
input = input.to_global(grad_sbp=input.sbp)
logits = flow._C.matmul(input, w, transpose_b=True)
if self.bias is not None:
logits = logits + self.bias
return logits
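# A local sketch of the tied-embedding logits computed above: the hidden states are
# multiplied by the transposed word-embedding matrix, giving one score per vocabulary id.
# Sizes are illustrative; no sbp/placement handling is shown here.
import oneflow as flow

vocab_size, hidden_size = 100, 16
word_embeddings = flow.randn(vocab_size, hidden_size)
hidden_states = flow.randn(2, 7, hidden_size)     # [bsz, seq_len, hidden_size]

logits = flow.matmul(hidden_states, word_embeddings, transpose_b=True)
assert tuple(logits.shape) == (2, 7, vocab_size)  # [bsz, seq_len, vocab_size]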
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.layers import Linear, build_activation
class MLP(nn.Module):
"""MLP
The MLP takes an input of hidden size h, projects it to the intermediate
hidden dimension, applies a GeLU transformation, and projects the
state back to hidden size h.
Arguments:
hidden_size: size of each input and output sample.
ffn_hidden_size: size of each intermediate sample.
output_dropout_prob: Output dropout probability. Defaults to 0.0.
init_method: method to initialize the first linear weight.
Defaults to :func:`nn.init.xavier_normal_`.
output_layer_init_method: method to initialize the second linear weight. If set to None,
it will use ``init_method`` instead. Defaults to None.
bias_gelu_fusion: If set to ``True``, it will fuse bias adding and elementwise
gelu activation. Defaults to ``False``.
bias_dropout_fusion: If set to ``True``, it will fuse bias adding and dropout.
Defaults to ``False``.
layer_idx: the layer index, which determines the placement. It is used in
pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
*,
layer_idx=0,
):
super().__init__()
self.output_dropout_prob = output_dropout_prob
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if output_layer_init_method is None:
output_layer_init_method = init_method
self.dense_h_to_4h = Linear(
hidden_size,
ffn_hidden_size,
bias=True,
parallel="col",
skip_bias_add=bias_gelu_fusion,
init_method=init_method,
layer_idx=layer_idx,
)
if not bias_gelu_fusion:
self.activation_func = build_activation("gelu")
self.dense_4h_to_h = Linear(
ffn_hidden_size,
hidden_size,
bias=True,
parallel="row",
skip_bias_add=bias_dropout_fusion,
init_method=output_layer_init_method,
layer_idx=layer_idx,
)
if not bias_dropout_fusion:
self.dropout = nn.Dropout(self.output_dropout_prob)
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate, bias = intermediate
intermediate = flow._C.fused_bias_add_gelu(
intermediate, bias, axis=intermediate.ndim - 1
)
else:
intermediate = self.activation_func(intermediate)
output = self.dense_4h_to_h(intermediate)
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.dropout(output)
return output
def extra_repr(self) -> str:
return "bias_gelu_fusion={}, bias_dropout_fusion={}, dropout={}".format(
self.bias_gelu_fusion, self.bias_dropout_fusion, self.output_dropout_prob
)
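# The dataflow of MLP above restated with plain oneflow modules (no parallelism and no
# fused kernels) to make the h -> ffn_hidden -> h shape contract explicit. Purely a
# local reference; it does not reproduce the sbp/placement behaviour of Linear.
import oneflow as flow
from oneflow import nn

hidden_size, ffn_hidden_size = 16, 64
reference_mlp = nn.Sequential(
    nn.Linear(hidden_size, ffn_hidden_size),
    nn.GELU(),
    nn.Linear(ffn_hidden_size, hidden_size),
    nn.Dropout(p=0.0),
)
x = flow.randn(2, 5, hidden_size)
y = reference_mlp(x)
assert y.shape == x.shape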
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow.nn as nn
from libai.utils import distributed as dist
from .attention import AttnMaskType, MultiheadAttention
from .droppath import DropPath
from .layer_norm import LayerNorm
from .mlp import MLP
class TransformerLayer(nn.Module):
"""A single transformer layer.
The transformer layer takes an input of size [bsz, seq_length, hidden_size] and returns an
output of the same size.
The input and output have the same sbp sign, (S(0), B).
Arguments:
hidden_size: size of hidden state.
ffn_hidden_size: size of the feed-forward neural network.
num_attention_heads: number of attention heads.
is_decoder: used to specify whether this is transformer encoder layer or transformer
decoder layer. Default: ``False``.
attention_dropout_prob: dropout probability of attention weights.
output_dropout_prob: dropout probability of output.
layernorm_epsilon: epsilon used in layernorm layer. Default: `1e-5`.
init_method: method to initialize the input layer weights.
output_layer_init_method: method to initialize the output layer weights.
If None, use `init_method`.
bias_gelu_fusion: whether fuse add bias and gelu. Default: ``False``.
bias_dropout_fusion: whether fuse add bias and dropout. Default: ``False``.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax. Default: ``False``.
apply_query_key_layer_scaling: if ``True``, scale the attention scores by the layer index.
Default: ``False``.
apply_residual_post_layernorm: if ``True``, use the original BERT residual
connection ordering. Otherwise, use the Megatron-style BERT residual connection
introduced in https://arxiv.org/pdf/1909.08053.pdf, which is more stable when
scaling up the model size.
Default: ``False``.
layer_idx: the layer index, which determines the placement.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
num_attention_heads,
is_decoder=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
drop_path_prob=0.0,
layernorm_epsilon=1e-5,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.num_attention_heads = num_attention_heads
self.attention_dropout_prob = attention_dropout_prob
self.output_dropout_prob = output_dropout_prob
self.layernorm_epsilon = layernorm_epsilon
self.attn_mask_type = attn_mask_type
self.layer_idx = layer_idx
self.is_decoder = is_decoder
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.apply_residual_post_layernorm = apply_residual_post_layernorm
self.init_method = init_method
if output_layer_init_method is None:
output_layer_init_method = init_method
self.output_layer_init_method = output_layer_init_method
self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0.0 else nn.Identity()
self.input_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.self_attention = self.build_attention(is_cross_attention=False)
self.post_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
if self.is_decoder:
self.cross_attention = self.build_attention(is_cross_attention=True)
self.post_cross_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.mlp = MLP(
self.hidden_size,
self.ffn_hidden_size,
self.output_dropout_prob,
self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_gelu_fusion=self.bias_gelu_fusion,
bias_dropout_fusion=self.bias_dropout_fusion,
layer_idx=self.layer_idx,
)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_states=None,
encoder_attention_mask=None,
past_key_value=None,
use_cache=False,
):
"""
Args:
hidden_states: shape is (batch_size, seq_length, hidden_size),
sbp signature is (S(0), B).
attention_mask: the combination of the key padding mask and the causal mask of hidden
states, with shape (batch_size, 1, seq_length, seq_length) and sbp
signature (S(0), B).
encoder_states: encoder output with shape (batch_size, seq_length, hidden_size)
and the sbp signature is (S(0), B), which will be used in cross attention.
encoder_attention_mask: key padding mask of encoder states with shape
(batch_size, 1, seq_length, seq_length) and the sbp signature is (S(0), B).
past_key_value: tuple of key and value, each with shape
(bsz, num_heads, seq_length, head_size). For a decoder layer,
past_key_value contains the states from both self-attention
and cross-attention.
use_cache: set to ``True`` when the model is in the inference phase and
used for incremental decoding.
"""
# Change placement for pipeline parallelism
hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx))
# hidden_states shape: (batch_size, seq_length, hidden_size)
if attention_mask is not None:
attention_mask = attention_mask.to_global(
placement=dist.get_layer_placement(self.layer_idx)
)
if past_key_value is not None:
if self.is_decoder:
assert len(past_key_value) == 4
self_attn_past_key_value = past_key_value[:2]
cross_attn_past_key_value = past_key_value[2:]
else:
self_attn_past_key_value = past_key_value
cross_attn_past_key_value = None
else:
self_attn_past_key_value, cross_attn_past_key_value = None, None
layernorm_output = self.input_layernorm(hidden_states)
attention_output = self.self_attention(
layernorm_output,
attention_mask=attention_mask,
past_key_value=self_attn_past_key_value,
use_cache=use_cache,
)
attention_output = self.drop_path(attention_output)
if use_cache:
attention_output, presents = attention_output
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_attention_layernorm(hidden_states)
if self.is_decoder:
attention_output = self.cross_attention(
layernorm_output,
encoder_states,
attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
use_cache=use_cache,
)
if use_cache:
attention_output, decoder_presents = attention_output
presents += decoder_presents
attention_output = self.drop_path(attention_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_cross_attention_layernorm(hidden_states)
mlp_output = self.mlp(layernorm_output)
mlp_output = self.drop_path(mlp_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
output = residual + mlp_output
if use_cache:
output = (output, presents)
return output
def build_attention(self, is_cross_attention=False):
return MultiheadAttention(
self.hidden_size,
self.num_attention_heads,
is_cross_attention=is_cross_attention,
attention_dropout_prob=self.attention_dropout_prob,
output_dropout_prob=self.output_dropout_prob,
init_method=self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_dropout_fusion=self.bias_dropout_fusion,
scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
attn_mask_type=self.attn_mask_type,
layer_idx=self.layer_idx,
)
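# A condensed, local restatement of the pre-LayerNorm residual pattern implemented by
# TransformerLayer.forward above (encoder path, no caching, masks, drop-path, or
# parallel placement). The nn.Identity() modules stand in for self_attention and mlp.
import oneflow as flow
from oneflow import nn

hidden_size = 16
input_layernorm = nn.LayerNorm(hidden_size)
post_attention_layernorm = nn.LayerNorm(hidden_size)
self_attention = nn.Identity()   # stand-in for MultiheadAttention
mlp = nn.Identity()              # stand-in for MLP

hidden_states = flow.randn(2, 5, hidden_size)
attention_output = self_attention(input_layernorm(hidden_states))
hidden_states = hidden_states + attention_output             # residual around attention
mlp_output = mlp(post_attention_layernorm(hidden_states))
output = hidden_states + mlp_output                           # residual around the MLP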