Commit 3b355d3f authored by yuguo960516

gpt2

parent fd158e88
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LayerNorm(nn.Module):
"""Applies Layer Normalization over a mini-batch of inputs in 1D parallelism.
Args:
normalized_shape: shape of the trailing dimensions of the input over which normalization is applied.
eps: a value added to the denominator for numerical stability. Defaults to 1e-5.
elementwise_affine: a boolean value that when set to ``True``, this module
has learnable per-element affine parameters initialized to ones (for weights)
and zeros (for biases). Default: ``True``.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self, normalized_shape, eps=1e-5, elementwise_affine=True, bias=True, *, layer_idx=0
):
super().__init__()
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape,)
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.layer_idx = layer_idx
if elementwise_affine:
self.weight = nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.bias = nn.Parameter(
flow.zeros(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
requires_grad=bias,
)
else:
self.weight = None
self.bias = None
def forward(self, x):
assert x.shape[-len(self.normalized_shape) :] == self.normalized_shape
begin_norm_axis = x.ndim - len(self.normalized_shape)
begin_params_axis = x.ndim - len(self.normalized_shape)
if self.elementwise_affine:
y = flow._C.layer_norm_affine(
x,
self.weight,
self.bias,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
else:
y = flow._C.layer_norm(
x,
begin_norm_axis=begin_norm_axis,
begin_params_axis=begin_params_axis,
epsilon=self.eps,
)
return y
def extra_repr(self) -> str:
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(
**self.__dict__
)
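# --- Usage sketch (not part of the original module) ---------------------------------
# A minimal, hypothetical example assuming it runs inside a launched LiBai
# distributed environment, so dist.get_layer_placement()/get_nd_sbp() are valid.
# It only illustrates how normalized_shape maps to the norm axes in forward():
# for an input of shape (batch, seq_len, hidden) and normalized_shape=(hidden,),
# begin_norm_axis = 3 - 1 = 2, i.e. normalization runs over the last dimension.
def _layer_norm_usage_sketch(hidden_size=768):
    ln = LayerNorm(hidden_size, eps=1e-5, layer_idx=0)
    x = flow.randn(
        2,
        16,
        hidden_size,
        placement=dist.get_layer_placement(0),
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
    )
    return ln(x)  # output has the same shape and sbp as x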
class RMSLayerNorm(nn.Module):
"""T5 uses a layer norm which only scales and does not shift, also known as
Root Mean Square Layer Normalization; the variance is therefore computed without
subtracting the mean and there is no bias. For more details, see https://arxiv.org/abs/1910.07467.
Args:
normalized_shape: shape of the trailing dimensions of the input over which the statistics are computed.
eps: a value added to the denominator for numerical stability. Defaults to 1e-6.
layer_idx: a layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(self, normalized_shape, eps=1e-6, layer_idx=0):
super().__init__()
self.layer_idx = layer_idx
self.weight = flow.nn.Parameter(
flow.ones(
normalized_shape,
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.l2norm_epsilon = eps
def forward(self, hidden_states):
return flow._C.rms_layer_norm(hidden_states, self.weight, self.l2norm_epsilon)
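# --- Reference sketch (not part of the original module) -----------------------------
# The fused flow._C.rms_layer_norm call above is, numerically, the plain RMSNorm
# formula y = x / sqrt(mean(x ** 2, dim=-1) + eps) * weight. The helper below is a
# local-tensor illustration of that formula only; the real module operates on
# global (placement + sbp) tensors.
def _rms_layer_norm_reference(x, weight, eps=1e-6):
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * flow.rsqrt(variance + eps) * weight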
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class Linear1D(nn.Module):
r"""Linear layer with 1D parallelism which includes column parallelism and row parallelism.
The linear layer is defined as :math:`y = xA^T + b`.
In column parallelism, A^T is parallelized along the second dimension
as :math:`A^T = [A_1, ..., A_p]`.
In row parallelism, A^T is parallelized along the first dimension and X along its second
dimension as:
.. math::
A^T = \begin{bmatrix} A_1 \\ \vdots \\ A_p \end{bmatrix},
\quad
x = \begin{bmatrix} x_1 & \cdots & x_p \end{bmatrix}
Arguments:
in_features: size of each input sample.
out_features: size of each output sample.
bias: If set to ``False``, the layer will not learn an additive bias. Defaults to ``True``.
parallel: parallel mode, one of "data", "col" or "row". Defaults to "data".
init_method: method to initialize weight. Defaults to :func:`nn.init.xavier_normal_`.
skip_bias_add: skip adding bias but instead return it, so that adding bias can be fused with
other elementwise operations. Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in pipeline
parallelism. Defaults to 0.
"""
def __init__(
self,
in_features,
out_features,
bias=True,
parallel="data",
init_method=nn.init.xavier_normal_,
skip_bias_add=False,
*,
layer_idx=0, # enforce layer_idx passed with keyword
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.parallel = parallel
self.skip_bias_add = skip_bias_add
if parallel == "col":
# Column parallel
# weight sbp sign: [B, S(0)]; the weight is transposed when performing matmul,
# so its effective sbp sign is [B, S(1)]
# bias sbp sign: [B, S(0)]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])
elif parallel == "row":
# Row parallel
# weight sbp sign: [B, S(1)]; the weight is transposed when performing matmul,
# so its effective sbp sign is [B, S(0)]
# bias sbp sign: [B, B]
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
elif parallel == "data":
weight_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
bias_sbp = dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
else:
raise KeyError(f"{parallel} is not supported! Only 'data', 'col' and 'row' are supported.")
self.weight = flow.nn.Parameter(
flow.empty(
(out_features, in_features),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx), # for pipeline parallelism placement
sbp=weight_sbp,
)
)
init_method(self.weight)
self.bias = (
flow.nn.Parameter(
flow.zeros(
(out_features,),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=bias_sbp,
)
)
if bias
else None
)
def forward(self, x):
if dist.same_sbp(self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)])):
# If the last dim of weight sbp sign is S(0), then last dim of weight.t sbp
# sign is S(1), so the last dim of x sbp sign must be B.
if self.weight.sbp[-1] == flow.sbp.split(0):
x_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
x = x.to_global(sbp=x_sbp)
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
x = flow.matmul(x, self.weight, transpose_b=True)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(1)])
):
# If the last dim of weight sbp sign is S(1), then last dim of weight.t sbp
# sign is S(0), so the last dim of x sbp sign must be S(ndim-1).
if self.weight.sbp[-1] == flow.sbp.split(1):
x_sbp = x.sbp[:-1] + (flow.sbp.split(x.ndim - 1),)
x = x.to_global(sbp=x_sbp)
out_sbp = x.sbp[:-1] + (flow.sbp.broadcast,)
else:
out_sbp = x.sbp
x = flow.matmul(x, self.weight, transpose_b=True)
# Change x.sbp for followup forward pass.
# This line can be removed when sbp can be auto inferred.
x = x.to_global(sbp=out_sbp)
elif dist.same_sbp(
self.weight.sbp, dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
):
# x.grad sbp must be x.sbp, otherwise backward pass cannot be performed correctly.
x = x.to_global(grad_sbp=x.sbp)
# NOTE(chengcheng): when input x is [S(0), B], there is no need to change sbp for x.
# x = x.to_global(sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(0)]))
x = flow.matmul(x, self.weight, transpose_b=True)
else:
# Unsupported weight sbp: let OneFlow deduce the sbp and communicate with nccl automatically.
x = flow.matmul(x, self.weight, transpose_b=True)
if self.bias is not None:
if self.skip_bias_add:
return x, self.bias
else:
return x + self.bias
else:
return x
def extra_repr(self) -> str:
return "in_features={}, out_features={}, bias={}, parallel={}".format(
self.in_features,
self.out_features,
self.bias is not None,
self.parallel,
)
# Give an alias for Linear1d
Linear = Linear1D
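# --- Usage sketch (not part of the original module) ---------------------------------
# Hypothetical example assuming a launched 2D-parallel environment. "col" splits
# out_features across tensor-parallel ranks, "row" splits in_features, and "data"
# keeps the weight replicated; a column-parallel layer followed by a row-parallel
# layer is the usual pairing, as in the MLP module later in this commit.
def _linear1d_usage_sketch(hidden_size=768, ffn_hidden_size=3072):
    col = Linear1D(hidden_size, ffn_hidden_size, parallel="col", layer_idx=0)
    row = Linear1D(ffn_hidden_size, hidden_size, parallel="row", layer_idx=0)
    x = flow.randn(
        2,
        16,
        hidden_size,
        placement=dist.get_layer_placement(0),
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
    )
    return row(col(x))  # (2, 16, hidden_size), sbp (S(0), B)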
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
class LMLogits(nn.Module):
def __init__(self, vocab_size, bias=False):
super().__init__()
self.bias = (
nn.Parameter(
flow.zeros(
(vocab_size,),
dtype=flow.float32,
placement=dist.get_layer_placement(-1),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
)
)
if bias
else None
)
def forward(self, input, word_embeddings):
"""LM logits using word embedding weights"""
# input with sbp sign [S(0), B] and word_embeddings with sbp sign [S(0), B]
# NOTE(l1aoxingyu): This is for pipeline parallelism
# change word embedding placement from stage(0) to stage(-1)
w = word_embeddings.to_global(placement=input.placement)
# NOTE(l1aoxingyu): input x embed^T = logits with sbp sign
# [S(0), B] x [B, S(1)] --> [S(0), S(1)]
# ↑ ↑ ↑
# input embed^T logits
# Backward pass input.grad = logits.grad x embed with sbp sign
# [S(0), S(1)] x [B, S(0)] --> [S(0), P]
# ↑ ↑ ↑
# logits.grad embed input.grad
# When using input.grad as the head node of the backward pass, we need to convert
# its sbp sign from [S(0), P] --> [S(0), B]
input = input.to_global(grad_sbp=input.sbp)
logits = flow._C.matmul(input, w, transpose_b=True)
if self.bias is not None:
logits = logits + self.bias
return logits
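# --- Shape sketch (not part of the original module) ---------------------------------
# The head ties the output projection to the input embedding: logits are just the
# matmul of the hidden states with the transposed word-embedding matrix. Ignoring
# sbp/placement, (batch, seq_len, hidden) x (vocab_size, hidden)^T gives
# (batch, seq_len, vocab_size).
def _lm_logits_shape_sketch(hidden_states, embedding_weight):
    return flow.matmul(hidden_states, embedding_weight, transpose_b=True)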
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.layers import Linear, build_activation
class MLP(nn.Module):
"""MLP
The MLP takes an input with hidden size h, projects it to the intermediate
hidden dimension (ffn_hidden_size), applies a GeLU activation, and projects the
state back to hidden size h.
Arguments:
hidden_size: size of each input and output sample.
ffn_hidden_size: size of each intermediate sample.
output_dropout_prob: Output dropout probability. Defaults to 0.0.
init_method: method to initialize the first linear weight.
Defaults to :func:`nn.init.xavier_normal_`.
output_layer_init_method: method to initialize the second linear weight. If set to None,
it will use ``init_method`` instead. Defaults to None.
bias_gelu_fusion: If set to ``True``, it will fuse bias adding and elementwise
gelu activation. Defaults to ``False``.
bias_dropout_fusion: If set to ``True``, it will fuse bias adding and dropout.
Defaults to ``False``.
layer_idx: A layer_idx sign which determines the placement. It will be used in
pipeline parallelism. Defaults to 0.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
*,
layer_idx=0,
):
super().__init__()
self.output_dropout_prob = output_dropout_prob
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
if output_layer_init_method is None:
output_layer_init_method = init_method
self.dense_h_to_4h = Linear(
hidden_size,
ffn_hidden_size,
bias=True,
parallel="col",
skip_bias_add=bias_gelu_fusion,
init_method=init_method,
layer_idx=layer_idx,
)
if not bias_gelu_fusion:
self.activation_func = build_activation("gelu")
self.dense_4h_to_h = Linear(
ffn_hidden_size,
hidden_size,
bias=True,
parallel="row",
skip_bias_add=bias_dropout_fusion,
init_method=output_layer_init_method,
layer_idx=layer_idx,
)
if not bias_dropout_fusion:
self.dropout = nn.Dropout(self.output_dropout_prob)
def forward(self, hidden_states):
intermediate = self.dense_h_to_4h(hidden_states)
if self.bias_gelu_fusion:
intermediate, bias = intermediate
intermediate = flow._C.fused_bias_add_gelu(
intermediate, bias, axis=intermediate.ndim - 1
)
else:
intermediate = self.activation_func(intermediate)
output = self.dense_4h_to_h(intermediate)
if self.bias_dropout_fusion:
output, bias = output
output = flow._C.fused_bias_add_dropout(
output, bias, p=self.output_dropout_prob, axis=output.ndim - 1
)
else:
output = self.dropout(output)
return output
def extra_repr(self) -> str:
return "bias_gelu_fusion={}, bias_dropout_fusion={}, dropout={}".format(
self.bias_gelu_fusion, self.bias_dropout_fusion, self.output_dropout_prob
)
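# --- Reference sketch (not part of the original module) -----------------------------
# Simplified, local-tensor view of the h -> ffn -> h pattern implemented above with
# the fusion flags disabled: expand the hidden size, apply GeLU, project back, then
# apply dropout. Weights are assumed to be stored as (out_features, in_features),
# matching Linear1D.
def _mlp_reference_sketch(x, w1, b1, w2, b2, p=0.0):
    h = flow.matmul(x, w1, transpose_b=True) + b1  # (..., hidden) -> (..., ffn)
    h = flow.nn.functional.gelu(h)
    out = flow.matmul(h, w2, transpose_b=True) + b2  # (..., ffn) -> (..., hidden)
    return flow.nn.functional.dropout(out, p=p)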
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow.nn as nn
from libai.utils import distributed as dist
from .attention import AttnMaskType, MultiheadAttention
from .droppath import DropPath
from .layer_norm import LayerNorm
from .mlp import MLP
class TransformerLayer(nn.Module):
"""A single transformer layer.
A transformer layer takes an input of size [bsz, seq_length, hidden_size] and returns an
output of the same size.
The input and output have the same sbp sign, (S(0), B).
Arguments:
hidden_size: size of hidden state.
ffn_hidden_size: size of the feed-forward network hidden layer.
num_attention_heads: number of attention heads.
is_decoder: whether this is a transformer decoder layer (with cross attention)
or a transformer encoder layer. Default: ``False``.
attention_dropout_prob: dropout probability of attention weights.
output_dropout_prob: dropout probability of output.
layernorm_epsilon: epsilon used in layernorm layer. Default: `1e-5`.
init_method: method to initialize the input layer weights.
output_layer_init_method: method to initialize the output layer weights.
If None, use `init_method`.
bias_gelu_fusion: whether fuse add bias and gelu. Default: ``False``.
bias_dropout_fusion: whether fuse add bias and dropout. Default: ``False``.
scale_mask_softmax_fusion: whether to fuse scale, mask and softmax. Default: ``False``.
apply_query_key_layer_scaling: if ``True``, scale the attention score by the layer index.
Default: ``False``.
apply_residual_post_layernorm: if ``True``, use the original BERT residual
connection ordering; otherwise use the Megatron-BERT residual connection, which
is more stable when scaling up model size (see https://arxiv.org/pdf/1909.08053.pdf).
Default: ``False``.
layer_idx: the layer index, which determines the placement.
"""
def __init__(
self,
hidden_size,
ffn_hidden_size,
num_attention_heads,
is_decoder=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
drop_path_prob=0.0,
layernorm_epsilon=1e-5,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
attn_mask_type=AttnMaskType.padding,
*,
layer_idx=0
):
super().__init__()
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.num_attention_heads = num_attention_heads
self.attention_dropout_prob = attention_dropout_prob
self.output_dropout_prob = output_dropout_prob
self.layernorm_epsilon = layernorm_epsilon
self.attn_mask_type = attn_mask_type
self.layer_idx = layer_idx
self.is_decoder = is_decoder
self.bias_gelu_fusion = bias_gelu_fusion
self.bias_dropout_fusion = bias_dropout_fusion
self.scale_mask_softmax_fusion = scale_mask_softmax_fusion
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.apply_residual_post_layernorm = apply_residual_post_layernorm
self.init_method = init_method
if output_layer_init_method is None:
output_layer_init_method = init_method
self.output_layer_init_method = output_layer_init_method
self.drop_path = DropPath(drop_path_prob) if drop_path_prob > 0.0 else nn.Identity()
self.input_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.self_attention = self.build_attention(is_cross_attention=False)
self.post_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
if self.is_decoder:
self.cross_attention = self.build_attention(is_cross_attention=True)
self.post_cross_attention_layernorm = LayerNorm(
self.hidden_size, eps=self.layernorm_epsilon, layer_idx=self.layer_idx
)
self.mlp = MLP(
self.hidden_size,
self.ffn_hidden_size,
self.output_dropout_prob,
self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_gelu_fusion=self.bias_gelu_fusion,
bias_dropout_fusion=self.bias_dropout_fusion,
layer_idx=self.layer_idx,
)
def forward(
self,
hidden_states,
attention_mask=None,
encoder_states=None,
encoder_attention_mask=None,
past_key_value=None,
use_cache=False,
):
"""
Args:
hidden_states: shape is (batch_size, seq_length, hidden_size),
sbp signature is (S(0), B).
attention_mask: the combination of the key padding mask and the causal mask of the
hidden states, with shape (batch_size, 1, seq_length, seq_length) and sbp
signature (S(0), B).
encoder_states: encoder output with shape (batch_size, seq_length, hidden_size)
and the sbp signature is (S(0), B), which will be used in cross attention.
encoder_attention_mask: key padding mask of encoder states with shape
(batch_size, 1, seq_length, seq_length) and the sbp signature is (S(0), B).
past_key_value: tuple of key and value tensors, each with shape
(seq_length, bsz, num_heads, head_size). For a decoder layer,
past_key_value contains the states from both self attention
and cross attention.
use_cache: set to ``True`` when the model is in the inference phase, to enable
incremental decoding.
"""
# Change placement for pipeline parallelism
hidden_states = hidden_states.to_global(placement=dist.get_layer_placement(self.layer_idx))
# hidden_states shape: (batch_size, seq_length, hidden_size)
if attention_mask is not None:
attention_mask = attention_mask.to_global(
placement=dist.get_layer_placement(self.layer_idx)
)
if past_key_value is not None:
if self.is_decoder:
assert len(past_key_value) == 4
self_attn_past_key_value = past_key_value[:2]
cross_attn_past_key_value = past_key_value[2:]
else:
self_attn_past_key_value = past_key_value
cross_attn_past_key_value = None
else:
self_attn_past_key_value, cross_attn_past_key_value = None, None
layernorm_output = self.input_layernorm(hidden_states)
attention_output = self.self_attention(
layernorm_output,
attention_mask=attention_mask,
past_key_value=self_attn_past_key_value,
use_cache=use_cache,
)
attention_output = self.drop_path(attention_output)
if use_cache:
attention_output, presents = attention_output
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_attention_layernorm(hidden_states)
if self.is_decoder:
attention_output = self.cross_attention(
layernorm_output,
encoder_states,
attention_mask=encoder_attention_mask,
past_key_value=cross_attn_past_key_value,
use_cache=use_cache,
)
if use_cache:
attention_output, decoder_presents = attention_output
presents += decoder_presents
attention_output = self.drop_path(attention_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
hidden_states = residual + attention_output
layernorm_output = self.post_cross_attention_layernorm(hidden_states)
mlp_output = self.mlp(layernorm_output)
mlp_output = self.drop_path(mlp_output)
if self.apply_residual_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
output = residual + mlp_output
if use_cache:
output = (output, presents)
return output
def build_attention(self, is_cross_attention=False):
return MultiheadAttention(
self.hidden_size,
self.num_attention_heads,
is_cross_attention=is_cross_attention,
attention_dropout_prob=self.attention_dropout_prob,
output_dropout_prob=self.output_dropout_prob,
init_method=self.init_method,
output_layer_init_method=self.output_layer_init_method,
bias_dropout_fusion=self.bias_dropout_fusion,
scale_mask_softmax_fusion=self.scale_mask_softmax_fusion,
apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
attn_mask_type=self.attn_mask_type,
layer_idx=self.layer_idx,
)
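# --- Reading aid (not part of the original module) ----------------------------------
# Minimal sketch of the pre-LayerNorm residual ordering used in forward() above for
# the encoder path, with use_cache=False and apply_residual_post_layernorm=False.
# ln1/attn/ln2/mlp stand for input_layernorm, self_attention,
# post_attention_layernorm and mlp; sbp/placement handling is omitted.
def _pre_ln_block_sketch(x, ln1, attn, ln2, mlp, mask=None):
    h = x + attn(ln1(x), attention_mask=mask)  # attention sub-block + residual
    return h + mlp(ln2(h))  # MLP sub-block + residual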
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bert_model import BertForPreTraining, BertModel, BertForClassification
from .roberta_model import RobertaForPreTraining, RobertaForCausalLM, RobertaModel
from .build import build_graph, build_model
from .t5_model import T5ForPreTraining, T5Model
from .gpt_model import GPTForPreTraining, GPTModel
from .vision_transformer import VisionTransformer
from .swin_transformer import SwinTransformer
from .swin_transformer_v2 import SwinTransformerV2
from .resmlp import ResMLP
__all__ = [
"build_model",
"build_graph",
"BertModel",
"BertForPreTraining",
"BertForClassification",
"RobertaModel",
"RobertaForCausalLM",
"RobertaForPreTraining",
"T5Model",
"T5ForPreTraining",
"GPTModel",
"GPTForPreTraining",
"VisionTransformer",
"SwinTransformer",
"SwinTransformerV2",
"ResMLP",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class BertExtendedAttnMask(nn.Module):
def forward(self, attention_mask):
# We create a 3D attention mask from a 2D tensor mask.
# [b, 1, s]
attention_mask_b1s = attention_mask.unsqueeze(1)
# [b, s, 1]
attention_mask_bs1 = attention_mask.unsqueeze(2)
# [b, s, s]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)
return extended_attention_mask
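# --- Shape example (not part of the original module) --------------------------------
# A 2D padding mask (batch, seq_len) is expanded into a 4D mask
# (batch, 1, seq_len, seq_len) where entry (i, j) is 1 only if both token i and
# token j are real (non-padding) tokens, matching the attention-score shape.
def _extended_attn_mask_example():
    mask_2d = flow.tensor([[1, 1, 1, 0]], dtype=flow.int8)  # one padded position
    mask_4d = BertExtendedAttnMask()(mask_2d)
    return mask_4d.shape  # flow.Size([1, 1, 4, 4])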
class BertEmbeddings(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.vocab_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_sequence_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
# NOTE(l1aoxingyu): Set the position_ids sbp sign to [B, B] initially, because position_ids is a
# 1D tensor from 0 to seq_length; if it were set to [S(0), B] at first, position_ids
# would be split along the first dim of the hierarchy.
self.position_ids = flow.arange(
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
self.position_ids.size(),
dtype=flow.long,
sbp=self.position_ids.sbp,
placement=self.position_ids.placement,
)
else:
self.tokentype_embeddings = None
self.embedding_dropout = nn.Dropout(embedding_dropout_prob)
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
# Change position_ids sbp sign: [B, B] -> [S(0), B]
position_ids = (
self.position_ids[:, :seq_length].expand_as(input_ids).to_global(sbp=input_ids.sbp)
)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def word_embeddings(self):
return self.vocab_embeddings.weight
class BertLMPredictionHead(nn.Module):
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), layer_idx=-1)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
grad_sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(2)])
)
# NOTE(l1aoxingyu): hidden_states has shape [B, S, H] with sbp sign [S(0), S(2)]
# Change from [S(0), S(2)] -> [S(0), B] because layernorm cannot accept inputs with sbp S(2)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
return hidden_states
class BertPooler(nn.Module):
"""Pooler layer.
Pool hidden states of the first token and
add a linear transformation followed by a tanh.
Args:
hidden_size: hidden state feature dimension
"""
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="col",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("tanh")
def forward(self, hidden_states):
"""Just "pool" the model by taking the hidden state of the [CLS] token,
i.e. the first token of the sequence."""
# hidden_states: [bsz, seq_len, hidden_size]
select_token_tensor = hidden_states[:, 0, :]
pooled_output = self.dense(select_token_tensor)
pooled_output = self.activation_func(pooled_output)
return pooled_output
class BertLoss(nn.Module):
def __init__(self, add_binary_head):
super().__init__()
self.add_binary_head = add_binary_head
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask, binary_logits, ns_labels):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
binary_logits = binary_logits.to_global(placement=lm_output.placement)
ns_labels = ns_labels.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
# Change loss_mask.sum() sbp sign from [P, B] -> [B, B]
# because (lm_loss * loss_mask) / loss_mask.sum() cannot accept P / P
denominator = (
loss_mask.sum().to_global(sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
+ 1e-7
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
# NOTE(l1aoxingyu): Change lm loss sbp sign [P, P] -> [P, B] to add with sop loss
# whose sbp sign: [P, B]
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
if self.add_binary_head:
sop_loss = flow._C.cross_entropy(
binary_logits, ns_labels, ignore_index=-1, reduction="none"
).mean()
loss_dict["sop_loss"] = sop_loss
return loss_dict
class BertModel(nn.Module):
"""The bare Bert Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
hidden_layers (int): The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
hidden_dropout_prob (float, optional):
The dropout ratio for the output of each TransformerLayer. Defaults to 0.0.
attention_probs_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
max_position_embeddings (int):
Max sequence length of input, defines the shape of Position Embeddings
in ``BertEmbedding``.
num_tokentypes (int, optional):
Number of segment token indices. Defaults to 2.
add_pooling_layer (bool, optional):
Whether or not to add a pooling layer that pools the hidden state of the first
token of the whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``True``.
apply_residual_post_layernorm (bool, optional):
If set to ``True``, use the original BERT residual connection ordering; otherwise use the
Megatron-BERT residual connection, which is more stable when scaling up model size
(see https://arxiv.org/pdf/1909.08053.pdf).
Default: ``False``.
amp_enabled (bool, optional):
Whether or not to use fp16 for the embedding weight in the BERT model. Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(initializer_range)
scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
# Embeddings
self.embeddings = BertEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = BertExtendedAttnMask()
# Encoders
self.encoders = nn.ModuleList(
[
TransformerLayer(
hidden_size,
intermediate_size,
num_attention_heads,
attention_dropout_prob=attention_probs_dropout_prob,
output_dropout_prob=hidden_dropout_prob,
layernorm_epsilon=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
init_method=init_method,
output_layer_init_method=scaled_init_method,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.padding, # bert mask type
layer_idx=i,
)
for i in range(hidden_layers)
]
)
self.final_layernorm = LayerNorm((hidden_size,), eps=layernorm_eps, layer_idx=-1)
self.pooler = BertPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids, attention_mask, tokentype_ids=None):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention
on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first and
second portions of the inputs. Indices are selected in `[0, 1]`. Defaults to None.
"""
extended_attention_mask = self.extended_attn_mask(attention_mask)
embedding_output = self.embeddings(input_ids, tokentype_ids)
hidden_states = embedding_output
for layer in self.encoders:
hidden_states = layer(hidden_states, extended_attention_mask)
encoder_output = self.final_layernorm(hidden_states)
pooled_output = self.pooler(encoder_output) if self.pooler is not None else None
return encoder_output, pooled_output
def word_embeddings_weight(self):
return self.embeddings.word_embeddings()
class BertPreTrainingHeads(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, add_binary_head=True):
super().__init__()
self.predictions = BertLMPredictionHead(hidden_size, init_method)
self.seq_relationship = Linear(
hidden_size,
2,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.lm_logits = LMLogits(vocab_size, bias=True)
self.loss_func = BertLoss(add_binary_head)
def forward(
self,
sequence_output,
pooled_output,
word_embeddings_weight,
ns_labels,
lm_labels,
loss_mask,
):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
prediction_scores = self.lm_logits(prediction_scores, word_embeddings_weight)
if lm_labels is not None:
return self.loss_func(
prediction_scores, lm_labels, loss_mask, seq_relationship_score, ns_labels
)
return {
"prediction_scores": prediction_scores,
"seq_relationship_score": seq_relationship_score,
}
class BertForPreTraining(nn.Module):
"""Bert Model with two heads on top as done during the pretraining: a
`masked language modeling` head and a `next sentence prediction (classification)` head.
"""
def __init__(self, cfg):
super().__init__()
self.bert = BertModel(cfg)
self.cls_head = BertPreTrainingHeads(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.add_binary_head,
)
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
ns_labels=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
ns_labels (flow.LongTensor, optional): Labels for computing the next sequence prediction
(classification) loss. Input should be a sequence pair (see `input_ids` docstring).
Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
lm_labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
loss_mask (flow.BoolTensor, optional): Mask to avoid computing the loss
on ignored tokens. Tokens with indices set to `-1` are ignored (masked); the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
sequence_output, pooled_output = outputs[:2]
return self.cls_head(
sequence_output,
pooled_output,
self.bert.word_embeddings_weight(),
ns_labels,
lm_labels,
loss_mask,
)
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.bert.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, BertEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, BertExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, BertPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, BertPreTrainingHeads):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), BertEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), BertExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), BertPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), BertPreTrainingHeads):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class BertForClassification(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.num_labels = cfg.num_labels
self.bert = BertModel(cfg)
self.classifier = Linear(
cfg.hidden_size,
cfg.num_labels,
bias=True,
parallel="row",
init_method=init_method_normal(cfg.initializer_range),
layer_idx=-1,
)
classifier_dropout = (
cfg.classifier_dropout
if cfg.classifier_dropout is not None
else cfg.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
def forward(self, input_ids, attention_mask, tokentype_ids=None, labels=None, **kwargs):
labels = labels if labels is not None else kwargs.get("ns_labels")
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
loss = loss.to_global(sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast]))
return {"cls_loss": loss}
else:
return {"logits": logits}
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.config import instantiate, try_get_key
def build_model(cfg):
"""Build the whole model architecture, defined by ``cfg.model``.
Note that it does not load any weights from ``cfg``.
"""
model = instantiate(cfg)
return model
def build_graph(cfg, model, optimizer=None, lr_scheduler=None, is_train=False):
"""Build the `nn.Graph`, defined by ``cfg.graph``."""
auto_parallel_conf = try_get_key(cfg, "graph.auto_parallel", default=None)
if is_train:
# Set train graph
assert optimizer is not None, "optimizer must be set for train graph"
assert lr_scheduler is not None, "lr_scheduler must be set for train graph"
graph = cfg.graph.train_graph
graph.model = model
graph.optimizer = optimizer
graph.lr_scheduler = lr_scheduler
graph.fp16 = try_get_key(cfg, "train.amp.enabled", default=False)
graph.activation_checkpoint = try_get_key(
cfg, "train.activation_checkpoint.enabled", default=False
)
graph.zero_optim = try_get_key(cfg, "train.zero_optimization.enabled", default=False)
graph.zero_stage = try_get_key(cfg, "train.zero_optimization.stage", default=1)
graph.grad_acc_steps = try_get_key(cfg, "train.num_accumulation_steps", default=1)
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
else:
# Set eval graph
graph = cfg.graph.eval_graph
graph.model = model
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class CasualMask(nn.Module):
"""
Create a causal mask and combine it with the padding mask.
It is used in the GPT model and the T5 decoder.
For the T5 decoder, the argument `layer_idx` should be set to the index of the first decoder layer.
"""
def __init__(self, max_positions=1024, *, layer_idx=0):
super().__init__()
self.mask = flow.tril(
flow.ones(
(max_positions, max_positions),
dtype=flow.int8,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
def forward(self, input_ids, past_length=0, attention_mask=None):
bsz, tgt_len = input_ids.size()
casual_mask = self.mask[:tgt_len, :tgt_len]
if past_length > 0:
# in case past_key_values are used, we need to prepend a mask of ones to the causal mask
casual_mask = flow.cat(
[flow.ones(tgt_len, past_length, dtype=flow.int8), casual_mask], dim=-1
)
casual_mask = (
casual_mask.unsqueeze(0).unsqueeze(1).expand(bsz, 1, tgt_len, tgt_len + past_length)
)
casual_mask = casual_mask.to_global(sbp=input_ids.sbp)
if attention_mask is not None:
assert attention_mask.dim() == 4, "please extend the attention mask first"
casual_mask = casual_mask * attention_mask
return casual_mask
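# --- Illustration (not part of the original module) ---------------------------------
# Contents of the mask with local tensors and no padding mask: flow.tril keeps the
# lower triangle, so token i may only attend to tokens 0..i.
#   [[1, 0, 0, 0],
#    [1, 1, 0, 0],
#    [1, 1, 1, 0],
#    [1, 1, 1, 1]]
def _causal_mask_pattern(tgt_len=4):
    return flow.tril(flow.ones((tgt_len, tgt_len), dtype=flow.int8))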
class GPTModel(nn.Module):
"""GPT-2 language model. The output of the forward method is logits.
Args:
hidden_layers (int): The number of ``TransformerLayer`` in the gpt model.
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
ffn_hidden_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
max_seq_length (int, optional):
Max sequence length of input, defines the shape of Position Embeddings in GPTEmbedding.
Defaults to 1024.
embedding_dropout_prob (float, optional):
The dropout ratio for the output of GPTEmbedding Layer. Defaults to 0.0.
attention_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
output_dropout_prob (float, optional):
The dropout ratio for the output of each TransformerLayer. Defaults to 0.0.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
use_scaled_init_for_output_weights (bool, optional): Defaults to ``True``.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``False``.
apply_residual_post_layernorm (bool, optional):
If set to ``True``, use the original BERT residual connection ordering; otherwise use the
Megatron-BERT residual connection, which is more stable when scaling up model size
(see https://arxiv.org/pdf/1909.08053.pdf).
Default: ``False``.
amp_enabled (bool, optional):
Whether or not to use fp16 for the embedding weight in the GPT model. Defaults to ``False``.
"""
@configurable
def __init__(
self,
hidden_layers,
vocab_size,
hidden_size,
ffn_hidden_size,
num_attention_heads,
max_seq_length=1024,
embedding_dropout_prob=0.0,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
initializer_range=0.02,
use_scaled_init_for_output_weights=True,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(sigma=initializer_range)
if use_scaled_init_for_output_weights:
output_layer_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
else:
output_layer_init_method = init_method
self.embeddings = GPTEmbedding(
vocab_size,
hidden_size,
max_seq_length,
init_method=init_method,
embedding_dropout_prob=embedding_dropout_prob,
amp_enabled=amp_enabled,
)
self.transformer = Transformer(
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
)
self.lm_head = LMLogits(vocab_size, bias=False)
@classmethod
def from_config(cls, cfg):
return {
"hidden_layers": cfg.hidden_layers,
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"ffn_hidden_size": cfg.ffn_hidden_size,
"num_attention_heads": cfg.num_attention_heads,
"max_seq_length": cfg.max_seq_length,
"embedding_dropout_prob": cfg.embedding_dropout_prob,
"attention_dropout_prob": cfg.attention_dropout_prob,
"output_dropout_prob": cfg.output_dropout_prob,
"layernorm_epsilon": cfg.layernorm_epsilon,
"initializer_range": cfg.initializer_range,
"use_scaled_init_for_output_weights": cfg.use_scaled_init_for_output_weights,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
Returns:
flow.Tensor: logits
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
input_embeds = self.embeddings(input_ids, 0)
transformer_output = self.transformer(input_embeds, attention_mask=None)
output = self.lm_head(transformer_output, self.embeddings.token_embeddings.weight)
return output
class GPTEmbedding(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_seq_length,
init_method=init.xavier_normal_,
embedding_dropout_prob=0.0,
amp_enabled=False,
):
super().__init__()
self.token_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_seq_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.dropout = nn.Dropout(embedding_dropout_prob)
self.position_ids = flow.arange(
max_seq_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
def forward(self, input_ids, past_length=0):
bsz, seq_length = input_ids.size()
position_ids = self.position_ids[:, past_length : past_length + seq_length]
position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
token_embeds = self.token_embeddings(input_ids)
position_embeds = self.position_embeddings(position_ids)
input_embeds = token_embeds + position_embeds
input_embeds = self.dropout(input_embeds)
return input_embeds
class Transformer(nn.Module):
def __init__(
self,
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
init_method=init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
):
super().__init__()
self.hidden_layers = hidden_layers
def build_layer(layer_number):
return TransformerLayer(
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.causal,
layer_idx=layer_number,
)
self.layers = nn.ModuleList([build_layer(i) for i in range(self.hidden_layers)])
self.layernorm_f = LayerNorm(hidden_size, eps=layernorm_epsilon, layer_idx=-1)
def forward(self, hidden_states, attention_mask):
# hidden_states shape: (batch_size, seq_length, hidden_size)
# sbp: [S(0), B]
for i, layer in enumerate(self.layers):
hidden_states = layer(hidden_states, attention_mask)
output = self.layernorm_f(hidden_states)
return output
class GPTLoss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, logits, lm_labels):
lm_loss = self.lm_loss(logits, lm_labels)
lm_loss = lm_loss.mean()
return {"lm_loss": lm_loss}
class GPTForPreTraining(nn.Module):
"""
GPT Model with a language modeling head on top.
"""
def __init__(self, cfg) -> None:
super().__init__()
self.GPT_model = GPTModel(cfg)
self.loss_func = GPTLoss()
def forward(
self,
input_ids,
labels=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
labels (flow.LongTensor, optional): Labels for computing language modeling loss.
None for evaluating. Defaults to None.
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation.
:code:`{"lm_loss": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
logits = self.GPT_model(input_ids)
if labels is not None:
lm_loss = self.loss_func(logits, labels)
return lm_loss
else:
return {"prediction_scores": logits}
@staticmethod
def set_pipeline_stage_id(model: nn.Module):
dist_utils = dist.get_dist_util()
if hasattr(model.GPT_model.transformer.layernorm_f, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, (GPTEmbedding, CasualMask)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, (LMLogits, GPTLoss)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), (GPTEmbedding, CasualMask)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), (LMLogits, GPTLoss)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
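# Editor's note: illustrative sketch, not part of the original commit. It only
# shows how a caller would consume the two return forms of
# GPTForPreTraining.forward documented above; `model`, `input_ids`, and
# `labels` are assumed to be prepared elsewhere (e.g. by a LiBai trainer with
# the distributed environment already set up).
def _demo_pretraining_step(model, input_ids, labels=None):
    outputs = model(input_ids, labels)
    if labels is not None:
        return outputs["lm_loss"]  # training: scalar language-modeling loss
    return outputs["prediction_scores"]  # evaluation: (bsz, seq_length, vocab) logits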
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ResMLP Model
# References:
# resmlp: https://github.com/facebookresearch/deit/blob/main/resmlp_models.py
# --------------------------------------------------------
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
import libai.utils.distributed as dist
from libai.config import configurable
from libai.layers import MLP, DropPath, LayerNorm, Linear, PatchEmbedding
class Affine(nn.Module):
def __init__(self, dim, *, layer_idx=0):
super().__init__()
self.alpha = nn.Parameter(
flow.ones(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.beta = nn.Parameter(
flow.zeros(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
return self.alpha * x + self.beta
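# Editor's note: illustrative sketch, not part of the original commit. Affine
# is the per-channel replacement for LayerNorm used by ResMLP: y = alpha * x + beta,
# with no normalization statistics. The helper below reproduces that
# computation on plain local tensors (shapes are demo assumptions).
def _demo_affine(batch=2, num_patches=4, dim=8):
    x = flow.randn(batch, num_patches, dim)
    alpha, beta = flow.ones(dim), flow.zeros(dim)
    return alpha * x + beta  # broadcasts over the (batch, num_patches) axes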
class layers_scale_mlp_blocks(nn.Module):
def __init__(
self, dim, drop=0.0, drop_path=0.0, init_values=1e-4, num_patches=196, *, layer_idx=0
):
super().__init__()
self.norm1 = Affine(dim, layer_idx=layer_idx)
self.attn = Linear(num_patches, num_patches, layer_idx=layer_idx)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = Affine(dim, layer_idx=layer_idx)
self.mlp = MLP(hidden_size=dim, ffn_hidden_size=int(4.0 * dim), layer_idx=layer_idx)
self.gamma_1 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.gamma_2 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
x = x + self.drop_path(
self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2)
)
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
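# Editor's note: illustrative sketch, not part of the original commit. The
# `attn` branch above mixes information across patches rather than channels:
# the (B, N, D) input is transposed to (B, D, N), a Linear over the patch axis
# is applied, and the result is transposed back before the layer-scale
# residual. The helper below traces those shapes with plain local modules.
def _demo_cross_patch_mixing(batch=2, num_patches=196, dim=8):
    x = flow.randn(batch, num_patches, dim)
    patch_mixer = nn.Linear(num_patches, num_patches)  # local analogue of `self.attn`
    y = patch_mixer(x.transpose(1, 2)).transpose(1, 2)
    return y.shape  # same (batch, num_patches, dim) shape as the input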
class ResMLP(nn.Module):
"""ResMLP in LiBai.
LiBai's implementation of:
`ResMLP: Feedforward networks for image classification with data-efficient training
<https://arxiv.org/abs/2105.03404>`_
Args:
img_size (int, tuple(int)): input image size
patch_size (int, tuple(int)): patch size
in_chans (int): number of input channels
embed_dim (int): embedding dimension
depth (int): depth of transformer
drop_rate (float): dropout rate
drop_path_rate (float): stochastic depth rate
init_scale (float): the layer scale ratio
num_classes (int): number of classes for classification head
loss_func (callable, optional): loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
drop_rate=0.0,
drop_path_rate=0.0,
init_scale=1e-4,
num_classes=1000,
loss_func=None,
):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
self.patch_embed = PatchEmbedding(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
num_patches = self.patch_embed.num_patches
        dpr = [drop_path_rate for i in range(depth)]  # constant drop-path rate for every block (no decay)
self.blocks = nn.ModuleList(
[
layers_scale_mlp_blocks(
dim=embed_dim,
drop=drop_rate,
drop_path=dpr[i],
init_values=init_scale,
num_patches=num_patches,
layer_idx=i,
)
for i in range(depth)
]
)
self.norm = Affine(embed_dim, layer_idx=-1)
self.head = (
Linear(embed_dim, num_classes, layer_idx=-1) if num_classes > 0 else nn.Identity()
)
# loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
# weight init
self.apply(self._init_weights)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"drop_rate": cfg.drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"init_scale": cfg.init_scale,
"num_classes": cfg.num_classes,
"loss_func": cfg.loss_func,
}
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def forward_features(self, x):
x = self.patch_embed(x)
# layer scale mlp blocks
for i, blk in enumerate(self.blocks):
x = blk(x)
return x
def forward_head(self, x):
B = x.shape[0]
x = self.norm(x)
x = x.mean(dim=1).reshape(B, 1, -1)
return self.head(x[:, 0])
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.forward_head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.loss_func, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, PatchEmbedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), PatchEmbedding):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
@staticmethod
def set_activation_checkpoint(model):
for module_block in model.modules():
if hasattr(module_block, "origin"):
# Old API in OneFlow 0.8
if isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.activation_checkpointing = True
else:
if isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).activation_checkpointing = True
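# Editor's note: illustrative sketch, not part of the original commit. With the
# default configuration above (img_size=224, patch_size=16, embed_dim=768) the
# patch grid is (224 // 16) ** 2 = 196 tokens, so every block mixes a
# (batch, 196, 768) tensor. The helper below mirrors the mean-pooling done in
# `forward_head` on plain local tensors, without instantiating the model
# (which needs LiBai's distributed environment).
def _demo_head_pooling(batch=2, num_patches=196, embed_dim=768, num_classes=1000):
    tokens = flow.randn(batch, num_patches, embed_dim)
    pooled = tokens.mean(dim=1).reshape(batch, 1, -1)[:, 0]  # (batch, embed_dim)
    head = nn.Linear(embed_dim, num_classes)  # local analogue of `self.head`
    return head(pooled).shape  # (batch, num_classes)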