Commit 5988d2cc authored by yuguo960516
bert-large
parent 478602ba
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .bert_model import BertForPreTraining, BertModel, BertForClassification
from .roberta_model import RobertaForPreTraining, RobertaForCausalLM, RobertaModel
from .build import build_graph, build_model
from .t5_model import T5ForPreTraining, T5Model
from .gpt_model import GPTForPreTraining, GPTModel
from .vision_transformer import VisionTransformer
from .swin_transformer import SwinTransformer
from .swin_transformer_v2 import SwinTransformerV2
from .resmlp import ResMLP
__all__ = [
"build_model",
"build_graph",
"BertModel",
"BertForPreTraining",
"BertForClassification",
"RobertaModel",
"RobertaForCausalLM",
"RobertaForPreTraining",
"T5Model",
"T5ForPreTraining",
"GPTModel",
"GPTForPreTraining",
"VisionTransformer",
"SwinTransformer",
"SwinTransformerV2",
"ResMLP",
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class BertExtendedAttnMask(nn.Module):
def forward(self, attention_mask):
# We create a 3D attention mask from a 2D tensor mask.
# [b, 1, s]
attention_mask_b1s = attention_mask.unsqueeze(1)
# [b, s, 1]
attention_mask_bs1 = attention_mask.unsqueeze(2)
# [b, s, s]
attention_mask_bss = attention_mask_b1s * attention_mask_bs1
# [b, 1, s, s]
extended_attention_mask = attention_mask_bss.unsqueeze(1)
return extended_attention_mask
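# Illustrative sketch (not part of the original file): shows, with plain local tensors,
# how BertExtendedAttnMask turns a 2D padding mask [batch, seq] into the 4D mask
# [batch, 1, seq, seq] consumed by the attention layers. The shapes below are hypothetical.
def _demo_extended_attn_mask():
    mask_2d = flow.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=flow.int8)  # [b=2, s=4]
    extended = BertExtendedAttnMask()(mask_2d)
    # Entry (i, j) is 1 only when both token i and token j are real (non-padding) tokens.
    assert tuple(extended.shape) == (2, 1, 4, 4)
    return extended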
class BertEmbeddings(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__()
self.vocab_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_sequence_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
# NOTE(l1aoxingyu): Set the position_ids sbp sign to [B, B] initially, because position_ids
# is a 1D tensor from 0 to seq_length; if it were set to [S(0), B] at first, position_ids
# would be split along the first dim of the hierarchy.
self.position_ids = flow.arange(
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
self.position_ids.size(),
dtype=flow.long,
sbp=self.position_ids.sbp,
placement=self.position_ids.placement,
)
else:
self.tokentype_embeddings = None
self.embedding_dropout = nn.Dropout(embedding_dropout_prob)
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
# Change position_ids sbp sign: [B, B] -> [S(0), B]
position_ids = (
self.position_ids[:, :seq_length].expand_as(input_ids).to_global(sbp=input_ids.sbp)
)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def word_embeddings(self):
return self.vocab_embeddings.weight
class BertLMPredictionHead(nn.Module):
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), layer_idx=-1)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
grad_sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.split(2)])
)
# NOTE(l1aoxingyu): hidden_states shape is [B, S, H] whose sbp sign: [S(0), S(2)]
# Change from [S(0), S(2)] -> [S(0), B] because layernorm cannot get inputs with sbp S(2)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
return hidden_states
class BertPooler(nn.Module):
"""Pooler layer.
Pool the hidden state of the first token and
apply a linear transformation followed by a tanh.
Args:
hidden_size: hidden state feature dimension
init_method: method used to initialize the linear weight
"""
def __init__(self, hidden_size, init_method):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="col",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("tanh")
def forward(self, hidden_states):
"""Just "pool" the model by simply taking the [CLS] token corresponding
to the first token."""
# hidden_states: [bsz, seq_len, hidden_size]
select_token_tensor = hidden_states[:, 0, :]
pooled_output = self.dense(select_token_tensor)
pooled_output = self.activation_func(pooled_output)
return pooled_output
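# Illustrative sketch (not part of the original file): "pooling" in BertPooler just selects
# the hidden state of the first ([CLS]) token before the dense + tanh transform.
# Plain local tensors with hypothetical example sizes.
def _demo_cls_pooling():
    hidden_states = flow.randn(2, 5, 8)  # [batch, seq_len, hidden_size]
    cls_hidden = hidden_states[:, 0, :]  # [batch, hidden_size]
    return cls_hidden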
class BertLoss(nn.Module):
def __init__(self, add_binary_head):
super().__init__()
self.add_binary_head = add_binary_head
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask, binary_logits, ns_labels):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
binary_logits = binary_logits.to_global(placement=lm_output.placement)
ns_labels = ns_labels.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
# Change loss_mask.sum() sbp sign from [P, B] -> [B, B]
# because (lm_loss * loss_mask) / loss_mask.sum() cannot accept P / P
denominator = (
loss_mask.sum().to_global(sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
+ 1e-7
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
# NOTE(l1aoxingyu): Change lm loss sbp sign [P, P] -> [P, B] to add with sop loss
# whose sbp sign: [P, B]
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
if self.add_binary_head:
sop_loss = flow._C.cross_entropy(
binary_logits, ns_labels, ignore_index=-1, reduction="none"
).mean()
loss_dict["sop_loss"] = sop_loss
return loss_dict
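# Illustrative sketch (not part of the original file): the masked-LM loss above is a masked
# mean, sum(per_token_loss * loss_mask) / (loss_mask.sum() + 1e-7), reproduced here with
# plain local tensors and hypothetical values (the sbp/placement handling is omitted).
def _demo_masked_lm_mean():
    per_token_loss = flow.tensor([0.5, 1.5, 2.0, 3.0])
    loss_mask = flow.tensor([1.0, 1.0, 0.0, 0.0])  # only the first two tokens contribute
    denominator = loss_mask.sum() + 1e-7
    return flow.sum(per_token_loss * loss_mask) / denominator  # ~= 1.0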
class BertModel(nn.Module):
"""The bare Bert Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
hidden_layers (int): The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
hidden_dropout_prob (float):
The dropout ratio for the output of each ``TransformerLayer``.
attention_probs_dropout_prob (float):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
max_position_embeddings (int):
Max sequence length of input, defines the shape of Position Embeddings
in ``BertEmbedding``.
num_tokentypes (int, optional):
Number of segment token indices. Defaults to 2.
add_pooling_layer (bool, optional):
Whether or not to add a pooling layer on top of the sequence of hidden-states
for the whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
layernorm_eps (float, optional):
The epsilon of the LayerNorm layer. Defaults to 1e-12.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``True``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``True``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``True``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer-index-related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``True``.
apply_residual_post_layernorm (bool, optional):
If ``True``, use the original BERT residual connection ordering; otherwise use the
Megatron-BERT residual connection, which is more stable when scaling model size
(introduced in https://arxiv.org/pdf/1909.08053.pdf). Defaults to ``False``.
amp_enabled (bool, optional):
Whether or not to set fp16 for the embedding weight in the BERT model.
Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(initializer_range)
scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
# Embeddings
self.embeddings = BertEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = BertExtendedAttnMask()
# Encoders
self.encoders = nn.ModuleList(
[
TransformerLayer(
hidden_size,
intermediate_size,
num_attention_heads,
attention_dropout_prob=attention_probs_dropout_prob,
output_dropout_prob=hidden_dropout_prob,
layernorm_epsilon=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
init_method=init_method,
output_layer_init_method=scaled_init_method,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.padding, # bert mask type
layer_idx=i,
)
for i in range(hidden_layers)
]
)
self.final_layernorm = LayerNorm((hidden_size,), eps=layernorm_eps, layer_idx=-1)
self.pooler = BertPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids, attention_mask, tokentype_ids=None):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention
on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first and
second portions of the inputs. Indices are selected in `[0, 1]`. Defaults to None.
"""
extended_attention_mask = self.extended_attn_mask(attention_mask)
embedding_output = self.embeddings(input_ids, tokentype_ids)
hidden_states = embedding_output
for layer in self.encoders:
hidden_states = layer(hidden_states, extended_attention_mask)
encoder_output = self.final_layernorm(hidden_states)
pooled_output = self.pooler(encoder_output) if self.pooler is not None else None
return encoder_output, pooled_output
def word_embeddings_weight(self):
return self.embeddings.word_embeddings()
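# Illustrative usage sketch (not part of the original file): builds a small BertModel
# directly with keyword arguments. The hyper-parameter values are hypothetical, and the
# sketch assumes OneFlow's environment and LiBai's distributed util are already
# initialized (e.g. via dist.setup_dist_util in a normal LiBai entry point).
def _demo_bert_model():
    model = BertModel(
        vocab_size=30522,
        hidden_size=768,
        hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
    )
    input_ids = flow.ones(
        2, 128,
        dtype=flow.long,
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
        placement=dist.get_layer_placement(0),
    )
    attention_mask = flow.ones_like(input_ids)
    encoder_output, pooled_output = model(input_ids, attention_mask)
    return encoder_output, pooled_output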
class BertPreTrainingHeads(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, add_binary_head=True):
super().__init__()
self.predictions = BertLMPredictionHead(hidden_size, init_method)
self.seq_relationship = Linear(
hidden_size,
2,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.lm_logits = LMLogits(vocab_size, bias=True)
self.loss_func = BertLoss(add_binary_head)
def forward(
self,
sequence_output,
pooled_output,
word_embeddings_weight,
ns_labels,
lm_labels,
loss_mask,
):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
prediction_scores = self.lm_logits(prediction_scores, word_embeddings_weight)
if lm_labels is not None:
return self.loss_func(
prediction_scores, lm_labels, loss_mask, seq_relationship_score, ns_labels
)
return {
"prediction_scores": prediction_scores,
"seq_relationship_score": seq_relationship_score,
}
class BertForPreTraining(nn.Module):
"""Bert Model with two heads on top as done during the pretraining: a
`masked language modeling` head and a `next sentence prediction (classification)` head.
"""
def __init__(self, cfg):
super().__init__()
self.bert = BertModel(cfg)
self.cls_head = BertPreTrainingHeads(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.add_binary_head,
)
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
ns_labels=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
ns_labels (flow.LongTensor, optional): Labels for computing the next sequence prediction
(classification) loss. Input should be a sequence pair (see `input_ids` docstring).
Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
lm_labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
loss_mask (flow.BoolTensor, optional): Mask to avoid computing the loss
on ignored tokens. Tokens with indices set to `-1` are ignored (masked); the
loss is only computed for tokens with labels in `[0, ..., config.vocab_size]`.
Defaults to None.
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
sequence_output, pooled_output = outputs[:2]
return self.cls_head(
sequence_output,
pooled_output,
self.bert.word_embeddings_weight(),
ns_labels,
lm_labels,
loss_mask,
)
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.bert.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, BertEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, BertExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, BertPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, BertPreTrainingHeads):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), BertEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), BertExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), BertPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), BertPreTrainingHeads):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.bert.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class BertForClassification(nn.Module):
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
self.num_labels = cfg.num_labels
self.bert = BertModel(cfg)
self.classifier = Linear(
cfg.hidden_size,
cfg.num_labels,
bias=True,
parallel="row",
init_method=init_method_normal(cfg.initializer_range),
layer_idx=-1,
)
classifier_dropout = (
cfg.classifier_dropout
if cfg.classifier_dropout is not None
else cfg.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
def forward(self, input_ids, attention_mask, tokentype_ids=None, labels=None, **kwargs):
labels = labels if labels is not None else kwargs.get("ns_labels")
outputs = self.bert(input_ids, attention_mask, tokentype_ids)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
loss = loss.to_global(sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast]))
return {"cls_loss": loss}
else:
return {"logits": logits}
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.config import instantiate, try_get_key
def build_model(cfg):
"""Build the whole model architecture, defined by ``cfg.model``.
Note that it does not load any weights from ``cfg``.
"""
model = instantiate(cfg)
return model
def build_graph(cfg, model, optimizer=None, lr_scheduler=None, is_train=False):
"""Build the `nn.Graph`, defined by ``cfg.graph``."""
auto_parallel_conf = try_get_key(cfg, "graph.auto_parallel", default=None)
if is_train:
# Set train graph
assert optimizer is not None, "optimizer must be set for train graph"
assert lr_scheduler is not None, "lr_scheduler must be set for train graph"
graph = cfg.graph.train_graph
graph.model = model
graph.optimizer = optimizer
graph.lr_scheduler = lr_scheduler
graph.fp16 = try_get_key(cfg, "train.amp.enabled", default=False)
graph.activation_checkpoint = try_get_key(
cfg, "train.activation_checkpoint.enabled", default=False
)
graph.zero_optim = try_get_key(cfg, "train.zero_optimization.enabled", default=False)
graph.zero_stage = try_get_key(cfg, "train.zero_optimization.stage", default=1)
graph.grad_acc_steps = try_get_key(cfg, "train.num_accumulation_steps", default=1)
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
else:
# Set eval graph
graph = cfg.graph.eval_graph
graph.model = model
graph.auto_parallel_conf = auto_parallel_conf
return instantiate(graph)
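# Illustrative usage sketch (not part of the original file): in LiBai, ``cfg.model`` and
# ``cfg.graph`` are typically LazyCall configs defined in a config file; ``build_model``
# and ``build_graph`` then instantiate them. The values mentioned below are hypothetical,
# and the sketch assumes a fully populated LiBai config object is available.
def _demo_build(cfg):
    # cfg.model is expected to be something like:
    #   from libai.config import LazyCall
    #   from libai.models import BertModel
    #   cfg.model = LazyCall(BertModel)(vocab_size=30522, hidden_size=768, ...)
    model = build_model(cfg.model)
    # For an evaluation graph, only the model is required:
    eval_graph = build_graph(cfg, model, is_train=False)
    return model, eval_graph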
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow.nn import init
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
)
from libai.layers.attention import AttnMaskType
from libai.utils import distributed as dist
from .utils import init_method_normal, scaled_init_method_normal
class CasualMask(nn.Module):
"""
Create a causal mask and combine it with the padding mask.
It is used in the GPT model and the T5 decoder.
For the T5 decoder, the argument `layer_idx` should be set to the index of the first decoder layer.
"""
def __init__(self, max_positions=1024, *, layer_idx=0):
super().__init__()
self.mask = flow.tril(
flow.ones(
(max_positions, max_positions),
dtype=flow.int8,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
def forward(self, input_ids, past_length=0, attention_mask=None):
bsz, tgt_len = input_ids.size()
casual_mask = self.mask[:tgt_len, :tgt_len]
if past_length > 0:
# in case past_key_values are used, we need to prepend a prefix of ones to the causal mask
casual_mask = flow.cat(
[flow.ones(tgt_len, past_length, dtype=flow.int8), casual_mask], dim=-1
)
casual_mask = (
casual_mask.unsqueeze(0).unsqueeze(1).expand(bsz, 1, tgt_len, tgt_len + past_length)
)
casual_mask = casual_mask.to_global(sbp=input_ids.sbp)
if attention_mask is not None:
assert attention_mask.dim() == 4, "please extend the attention mask first"
casual_mask = casual_mask * attention_mask
return casual_mask
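# Illustrative sketch (not part of the original file): the lower-triangular pattern that
# CasualMask builds, reproduced with plain local tensors (sbp/placement omitted).
def _demo_causal_mask_pattern():
    tgt_len = 4
    causal = flow.tril(flow.ones((tgt_len, tgt_len), dtype=flow.int8))
    # Row i allows attention to positions 0..i only:
    # [[1, 0, 0, 0],
    #  [1, 1, 0, 0],
    #  [1, 1, 1, 0],
    #  [1, 1, 1, 1]]
    return causal.unsqueeze(0).unsqueeze(1)  # [1, 1, tgt_len, tgt_len]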
class GPTModel(nn.Module):
"""GPT-2 language model. The output of the forward method is logits.
Args:
hidden_layers (int): The number of ``TransformerLayer`` in the gpt model.
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
ffn_hidden_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
max_seq_length (int, optional):
Max sequence length of input, defines the shape of Position Embeddings in ``GPTEmbedding``.
Defaults to 1024.
embedding_dropout_prob (float, optional):
The dropout ratio for the output of GPTEmbedding Layer. Defaults to 0.0.
attention_dropout_prob (float, optional):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
Defaults to 0.0.
output_dropout_prob (float, optional):
The dropout ratio for the output of each ``TransformerLayer``. Defaults to 0.0.
layernorm_epsilon (float, optional):
The epsilon of LayerNorm layer. Defaults to 1e-5.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
use_scaled_init_for_output_weights (bool, optional):
Whether to use a layer-count-scaled initialization for output-layer weights.
Defaults to ``True``.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer-index-related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``False``.
apply_residual_post_layernorm (bool, optional):
If ``True``, use the original residual connection ordering; otherwise use the
Megatron-style residual connection, which is more stable when scaling model size
(introduced in https://arxiv.org/pdf/1909.08053.pdf). Defaults to ``False``.
amp_enabled (bool, optional):
Whether or not to set fp16 for the embedding weight in the GPT model.
Defaults to ``False``.
"""
@configurable
def __init__(
self,
hidden_layers,
vocab_size,
hidden_size,
ffn_hidden_size,
num_attention_heads,
max_seq_length=1024,
embedding_dropout_prob=0.0,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
initializer_range=0.02,
use_scaled_init_for_output_weights=True,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__()
init_method = init_method_normal(sigma=initializer_range)
if use_scaled_init_for_output_weights:
output_layer_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
else:
output_layer_init_method = init_method
self.embeddings = GPTEmbedding(
vocab_size,
hidden_size,
max_seq_length,
init_method=init_method,
embedding_dropout_prob=embedding_dropout_prob,
amp_enabled=amp_enabled,
)
self.transformer = Transformer(
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
)
self.lm_head = LMLogits(vocab_size, bias=False)
@classmethod
def from_config(cls, cfg):
return {
"hidden_layers": cfg.hidden_layers,
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"ffn_hidden_size": cfg.ffn_hidden_size,
"num_attention_heads": cfg.num_attention_heads,
"max_seq_length": cfg.max_seq_length,
"embedding_dropout_prob": cfg.embedding_dropout_prob,
"attention_dropout_prob": cfg.attention_dropout_prob,
"output_dropout_prob": cfg.output_dropout_prob,
"layernorm_epsilon": cfg.layernorm_epsilon,
"initializer_range": cfg.initializer_range,
"use_scaled_init_for_output_weights": cfg.use_scaled_init_for_output_weights,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(self, input_ids):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
Returns:
flow.Tensor: logits
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
input_embeds = self.embeddings(input_ids, 0)
transformer_output = self.transformer(input_embeds, attention_mask=None)
output = self.lm_head(transformer_output, self.embeddings.token_embeddings.weight)
return output
class GPTEmbedding(nn.Module):
def __init__(
self,
vocab_size,
hidden_size,
max_seq_length,
init_method=init.xavier_normal_,
embedding_dropout_prob=0.0,
amp_enabled=False,
):
super().__init__()
self.token_embeddings = VocabEmbedding(
vocab_size, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.position_embeddings = Embedding(
max_seq_length, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.dropout = nn.Dropout(embedding_dropout_prob)
self.position_ids = flow.arange(
max_seq_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
def forward(self, input_ids, past_length=0):
bsz, seq_length = input_ids.size()
position_ids = self.position_ids[:, past_length : past_length + seq_length]
position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
token_embeds = self.token_embeddings(input_ids)
position_embeds = self.position_embeddings(position_ids)
input_embeds = token_embeds + position_embeds
input_embeds = self.dropout(input_embeds)
return input_embeds
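# Illustrative sketch (not part of the original file): with ``past_length`` tokens already
# cached (incremental decoding), the new tokens receive positions
# [past_length, past_length + seq_length). Plain local tensors, hypothetical sizes.
def _demo_gpt_position_ids():
    max_seq_length, past_length, seq_length = 16, 3, 4
    position_ids = flow.arange(max_seq_length, dtype=flow.long).unsqueeze(0)
    return position_ids[:, past_length : past_length + seq_length]  # [[3, 4, 5, 6]]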
class Transformer(nn.Module):
def __init__(
self,
hidden_layers,
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
layernorm_epsilon=1e-5,
init_method=init.xavier_normal_,
output_layer_init_method=None,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=False,
apply_residual_post_layernorm=False,
):
super().__init__()
self.hidden_layers = hidden_layers
def build_layer(layer_number):
return TransformerLayer(
hidden_size,
ffn_hidden_size,
num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
output_dropout_prob=output_dropout_prob,
layernorm_epsilon=layernorm_epsilon,
init_method=init_method,
output_layer_init_method=output_layer_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.causal,
layer_idx=layer_number,
)
self.layers = nn.ModuleList([build_layer(i) for i in range(self.hidden_layers)])
self.layernorm_f = LayerNorm(hidden_size, eps=layernorm_epsilon, layer_idx=-1)
def forward(self, hidden_states, attention_mask):
# hidden_states shape: (batch_size, seq_length, hidden_size)
# sbp: [S(0), B]
for i, layer in enumerate(self.layers):
hidden_states = layer(hidden_states, attention_mask)
output = self.layernorm_f(hidden_states)
return output
class GPTLoss(nn.Module):
def __init__(self) -> None:
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, logits, lm_labels):
lm_loss = self.lm_loss(logits, lm_labels)
lm_loss = lm_loss.mean()
return {"lm_loss": lm_loss}
class GPTForPreTraining(nn.Module):
"""
GPT Model with a language modeling head on top.
"""
def __init__(self, cfg) -> None:
super().__init__()
self.GPT_model = GPTModel(cfg)
self.loss_func = GPTLoss()
def forward(
self,
input_ids,
labels=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
labels (flow.LongTensor, optional): Labels for computing language modeling loss.
None for evaluating. Defaults to None.
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation.
:code:`{"masked_lm_loss": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
logits = self.GPT_model(input_ids)
if labels is not None:
lm_loss = self.loss_func(logits, labels)
return lm_loss
else:
return {"prediction_scores": logits}
@staticmethod
def set_pipeline_stage_id(model: nn.Module):
dist_utils = dist.get_dist_util()
if hasattr(model.GPT_model.transformer.layernorm_f, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, (GPTEmbedding, CasualMask)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, (LMLogits, GPTLoss)):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), (GPTEmbedding, CasualMask)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), (LMLogits, GPTLoss)):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.GPT_model.transformer.layernorm_f.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ResMLP Model
# References:
# resmlp: https://github.com/facebookresearch/deit/blob/main/resmlp_models.py
# --------------------------------------------------------
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
import libai.utils.distributed as dist
from libai.config import configurable
from libai.layers import MLP, DropPath, LayerNorm, Linear, PatchEmbedding
class Affine(nn.Module):
def __init__(self, dim, *, layer_idx=0):
super().__init__()
self.alpha = nn.Parameter(
flow.ones(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.beta = nn.Parameter(
flow.zeros(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
return self.alpha * x + self.beta
class layers_scale_mlp_blocks(nn.Module):
def __init__(
self, dim, drop=0.0, drop_path=0.0, init_values=1e-4, num_patches=196, *, layer_idx=0
):
super().__init__()
self.norm1 = Affine(dim, layer_idx=layer_idx)
self.attn = Linear(num_patches, num_patches, layer_idx=layer_idx)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = Affine(dim, layer_idx=layer_idx)
self.mlp = MLP(hidden_size=dim, ffn_hidden_size=int(4.0 * dim), layer_idx=layer_idx)
self.gamma_1 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.gamma_2 = nn.Parameter(
init_values
* flow.ones(
dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(layer_idx),
),
requires_grad=True,
)
self.layer_idx = layer_idx
def forward(self, x):
x = x.to_global(placement=dist.get_layer_placement(self.layer_idx))
x = x + self.drop_path(
self.gamma_1 * self.attn(self.norm1(x).transpose(1, 2)).transpose(1, 2)
)
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
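# Illustrative sketch (not part of the original file): the ``attn`` Linear above mixes
# information across patches by transposing to [batch, dim, num_patches], applying a
# Linear over the patch axis, and transposing back. Plain local modules, hypothetical sizes.
def _demo_cross_patch_mixing():
    batch, num_patches, dim = 2, 196, 8
    x = flow.randn(batch, num_patches, dim)
    cross_patch = nn.Linear(num_patches, num_patches)
    y = cross_patch(x.transpose(1, 2)).transpose(1, 2)  # back to [batch, num_patches, dim]
    return y.shape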
class ResMLP(nn.Module):
"""ResMLP in LiBai.
LiBai's implementation of:
`ResMLP: Feedforward networks for image classification with data-efficient training
<https://arxiv.org/abs/2105.03404>`_
Args:
img_size (int, tuple(int)): input image size
patch_size (int, tuple(int)): patch size
in_chans (int): number of input channels
embed_dim (int): embedding dimension
depth (int): number of ResMLP blocks
drop_rate (float): dropout rate
drop_path_rate (float): stochastic depth rate
init_scale (float): the layer scale ratio
num_classes (int): number of classes for classification head
loss_func (callable, optional): loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
drop_rate=0.0,
drop_path_rate=0.0,
init_scale=1e-4,
num_classes=1000,
loss_func=None,
):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
self.patch_embed = PatchEmbedding(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
num_patches = self.patch_embed.num_patches
dpr = [drop_path_rate for i in range(depth)]  # constant stochastic depth rate per block
self.blocks = nn.ModuleList(
[
layers_scale_mlp_blocks(
dim=embed_dim,
drop=drop_rate,
drop_path=dpr[i],
init_values=init_scale,
num_patches=num_patches,
layer_idx=i,
)
for i in range(depth)
]
)
self.norm = Affine(embed_dim, layer_idx=-1)
self.head = (
Linear(embed_dim, num_classes, layer_idx=-1) if num_classes > 0 else nn.Identity()
)
# loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
# weight init
self.apply(self._init_weights)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"drop_rate": cfg.drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"init_scale": cfg.init_scale,
"num_classes": cfg.num_classes,
"loss_func": cfg.loss_func,
}
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def forward_features(self, x):
x = self.patch_embed(x)
# layer scale mlp blocks
for i, blk in enumerate(self.blocks):
x = blk(x)
return x
def forward_head(self, x):
B = x.shape[0]
x = self.norm(x)
x = x.mean(dim=1).reshape(B, 1, -1)
return self.head(x[:, 0])
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.forward_head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.loss_func, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, PatchEmbedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), PatchEmbedding):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set norm and head stage id
model.norm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
@staticmethod
def set_activation_checkpoint(model):
for module_block in model.modules():
if hasattr(module_block, "origin"):
# Old API in OneFlow 0.8
if isinstance(module_block.origin, layers_scale_mlp_blocks):
module_block.config.activation_checkpointing = True
else:
if isinstance(module_block.to(nn.Module), layers_scale_mlp_blocks):
module_block.to(nn.graph.GraphModule).activation_checkpointing = True
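# Illustrative usage sketch (not part of the original file): builds a small ResMLP and runs
# a dummy global batch through it. The hyper-parameters are hypothetical, and the sketch
# assumes OneFlow's environment and LiBai's distributed util are already initialized.
def _demo_resmlp():
    model = ResMLP(img_size=224, patch_size=16, embed_dim=384, depth=12, num_classes=1000)
    images = flow.randn(
        2, 3, 224, 224,
        sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
        placement=dist.get_layer_placement(0),
    )
    model.eval()
    return model(images)["prediction_scores"]  # logits of shape [2, num_classes]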
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
Linear,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
build_activation,
)
from libai.utils import distributed as dist
from .bert_model import BertEmbeddings, BertExtendedAttnMask, BertModel, BertPooler
from .utils import init_method_normal
class RobertaExtendedAttnMask(BertExtendedAttnMask):
"""
Same as BertExtendedAttnMask.
"""
class RobertaEmbeddings(BertEmbeddings):
"""
Same as BertEmbeddings, with small tweaks: vocab_embeddings and position_embeddings use a
padding index, and position ids are derived from input_ids.
"""
def __init__(
self,
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=0,
pad_token_id=1,
init_method=nn.init.xavier_normal_,
amp_enabled=False,
):
super().__init__(
vocab_size,
hidden_size,
max_sequence_length,
embedding_dropout_prob,
num_tokentypes=num_tokentypes,
init_method=init_method,
amp_enabled=amp_enabled,
)
self.pad_token_id = pad_token_id
self.vocab_embeddings = VocabEmbedding(
vocab_size,
hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
padding_idx=pad_token_id,
)
self.position_embeddings = Embedding(
max_sequence_length,
hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
padding_idx=pad_token_id,
)
if num_tokentypes > 0:
self.tokentype_embeddings = Embedding(
num_tokentypes, hidden_size, init_method=init_method, amp_enabled=amp_enabled
)
self.tokentype_ids = flow.zeros(
1,
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
else:
self.tokentype_embeddings = None
def forward(self, input_ids, tokentype_ids=None, position_ids=None):
seq_length = input_ids.size()[1]
word_embeddings = self.vocab_embeddings(input_ids)
if position_ids is None:
position_ids = self.create_position_ids_from_input_ids(input_ids, self.pad_token_id)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
if self.tokentype_embeddings is not None:
if tokentype_ids is None:
tokentype_ids = (
self.tokentype_ids[:, :seq_length]
.expand_as(input_ids)
.to_global(sbp=input_ids.sbp)
)
embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
embeddings = self.embedding_dropout(embeddings)
return embeddings
def create_position_ids_from_input_ids(self, input_ids, pad_token_id):
mask = input_ids.ne(pad_token_id).int()
position_ids = (flow.cumsum(mask, dim=1).type_as(mask)) * mask + pad_token_id
position_ids = position_ids.to_global(sbp=input_ids.sbp, placement=input_ids.placement)
return position_ids
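# Illustrative sketch (not part of the original file): RoBERTa-style position ids start
# counting after ``pad_token_id`` and keep padding positions at ``pad_token_id`` itself.
# Plain local tensors with hypothetical token ids (1 is the padding id).
def _demo_roberta_position_ids():
    pad_token_id = 1
    input_ids = flow.tensor([[0, 31414, 232, 2, 1, 1]])
    mask = input_ids.ne(pad_token_id).int()
    position_ids = flow.cumsum(mask, dim=1).type_as(mask) * mask + pad_token_id
    return position_ids  # [[2, 3, 4, 5, 1, 1]]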
class RobertaPooler(BertPooler):
"""
Same as BertPooler.
"""
class RobertaLoss(nn.Module):
def __init__(self):
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, lm_output, lm_labels, loss_mask):
lm_labels = lm_labels.to_global(placement=lm_output.placement)
loss_mask = loss_mask.to_global(placement=lm_output.placement)
lm_loss = self.lm_loss(lm_output, lm_labels)
loss_mask = loss_mask.float()
# Change loss_mask.sum() sbp sign from [P, B] -> [B, B]
# because (lm_loss * loss_mask) / loss_mask.sum() cannot accept P / P
denominator = loss_mask.sum().to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
loss_dict = {"lm_loss": masked_lm_loss}
return loss_dict
class RobertaModel(BertModel):
"""The bare Roberta Model transformer outputting raw hidden-states without
any specific head on top.
Args:
vocab_size (int):
The size of vocabulary file.
hidden_size (int):
The size of hidden states.
hidden_layers (int):
The number of ``TransformerLayer`` in encoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each
``TransformerLayer``.
hidden_dropout_prob (float):
The dropout ratio for the output of each ``TransformerLayer``.
attention_probs_dropout_prob (float):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
max_position_embeddings (int):
Max sequence length of input, defines the shape of Position Embeddings
in ``RobertaEmbeddings``.
num_tokentypes (int, optional):
Number of segment token indices. Defaults to 2.
add_pooling_layer (bool, optional):
Whether or not to add a pooling layer on top of the sequence of hidden-states
for the whole input sequence. Defaults to ``True``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
layernorm_eps (float, optional):
The epsilon of the LayerNorm layer. Defaults to 1e-12.
pad_token_id (int, optional):
The token id used for padding. Defaults to 1.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``True``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``True``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``True``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer-index-related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``True``.
apply_residual_post_layernorm (bool, optional):
If ``True``, use the original BERT/RoBERTa residual connection ordering; otherwise
use the Megatron-BERT residual connection, which is more stable when scaling
model size (introduced in https://arxiv.org/pdf/1909.08053.pdf).
Defaults to ``False``.
amp_enabled (bool, optional):
Whether or not to set fp16 for the embedding weight in the RoBERTa model.
Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=2,
add_pooling_layer=True,
initializer_range=0.02,
layernorm_eps=1e-12,
pad_token_id=1,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
scale_mask_softmax_fusion=True,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
):
super().__init__(
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
num_tokentypes=num_tokentypes,
add_pooling_layer=add_pooling_layer,
initializer_range=initializer_range,
layernorm_eps=layernorm_eps,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
amp_enabled=amp_enabled,
)
init_method = init_method_normal(initializer_range)
# Embeddings
self.embeddings = RobertaEmbeddings(
vocab_size,
hidden_size,
max_position_embeddings,
hidden_dropout_prob,
num_tokentypes,
pad_token_id,
init_method,
amp_enabled,
)
# Mask generation
self.extended_attn_mask = RobertaExtendedAttnMask()
self.pooler = RobertaPooler(hidden_size, init_method) if add_pooling_layer else None
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"num_tokentypes": cfg.num_tokentypes,
"add_pooling_layer": cfg.add_pooling_layer,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"pad_token_id": cfg.pad_token_id,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
class RobertaLMHead(nn.Module):
def __init__(self, vocab_size, hidden_size, init_method, layer_norm_eps):
super().__init__()
self.dense = Linear(
hidden_size,
hidden_size,
bias=True,
parallel="data",
init_method=init_method,
layer_idx=-1,
)
self.activation_func = build_activation("gelu")
self.layernorm = LayerNorm((hidden_size,), eps=layer_norm_eps, layer_idx=-1)
# NOTE(xzp): LMLogits acts as the decoder (nn.Linear(hidden_size, vocab_size));
# it shares roberta's word_embeddings weight.
self.lm_logits = LMLogits(vocab_size, bias=True)
def forward(self, hidden_states, word_embeddings_weight):
hidden_states = self.dense(hidden_states)
hidden_states = self.activation_func(hidden_states)
hidden_states = hidden_states.to_global(
sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast])
)
hidden_states = self.layernorm(hidden_states)
hidden_states = self.lm_logits(hidden_states, word_embeddings_weight)
return hidden_states
class RobertaPreTrainedModel(nn.Module):
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.roberta.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.origin, RobertaEmbeddings):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, RobertaExtendedAttnMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# `add_pooling_layer` is set to False in RobertaForPreTraining and RobertaForCausalLM,
# so this branch only applies when a pooler is present.
elif isinstance(module_block.origin, RobertaPooler):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, RobertaLMHead):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.roberta.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
# module.origin can get the original module
if isinstance(module_block.to(nn.Module), RobertaEmbeddings):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), RobertaExtendedAttnMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# `add_pooling_layer` is set to False in RobertaForPreTraining and RobertaForCausalLM,
# so this branch only applies when a pooler is present.
elif isinstance(module_block.to(nn.Module), RobertaPooler):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), RobertaLMHead):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
# Set the last layernorm stage id
model.roberta.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
class RobertaForPreTraining(RobertaPreTrainedModel):
def __init__(self, cfg):
super().__init__()
cfg.add_pooling_layer = False
self.roberta = RobertaModel(cfg)
self.lm_head = RobertaLMHead(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.layernorm_eps,
)
self.loss_fc = RobertaLoss()
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
lm_labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
lm_labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
Defaults to None.
loss_mask (flow.BoolTensor, optional): Mask to avoid computing the loss
on ignored tokens. Tokens with indices set to `-1` are ignored (masked); the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Defaults to None.
"""
input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
attention_mask = attention_mask.to_global(placement=dist.get_layer_placement(0))
if tokentype_ids is not None:
    tokentype_ids = tokentype_ids.to_global(placement=dist.get_layer_placement(0))
outputs = self.roberta(input_ids, attention_mask, tokentype_ids=tokentype_ids)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())
if lm_labels is not None:
return self.loss_fc(prediction_scores, lm_labels, loss_mask)
return {"prediction_scores": prediction_scores}
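# Usage sketch (hedged, assuming a LazyConfig-style `cfg` that also satisfies
# RobertaModel's requirements):
#
#   model = RobertaForPreTraining(cfg)
#   out = model(input_ids, attention_mask, tokentype_ids, lm_labels, loss_mask)
#   # with lm_labels: whatever RobertaLoss returns; without: {"prediction_scores": ...}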
class RobertaForCausalLM(RobertaPreTrainedModel):
def __init__(self, cfg):
super().__init__()
cfg.add_pooling_layer = False
self.roberta = RobertaModel(cfg)
self.lm_head = RobertaLMHead(
cfg.vocab_size,
cfg.hidden_size,
init_method_normal(cfg.initializer_range),
cfg.layernorm_eps,
)
self.loss_fc = RobertaLoss()
def forward(
self,
input_ids,
attention_mask,
tokentype_ids=None,
position_ids=None,
labels=None,
loss_mask=None,
):
"""
Args:
input_ids (flow.LongTensor): Indices of input sequence tokens in vocabulary.
attention_mask (flow.BoolTensor): Mask to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
tokentype_ids (flow.LongTensor, optional): Segment token indices to indicate first
and second portions of the inputs. Indices are selected in `[0, 1]`.
Defaults to None.
position_ids (flow.LongTensor, optional): Indices of positions of each input sequence
tokens in the position embeddings. Defaults to None.
labels (flow.LongTensor, optional): Labels for computing the left-to-right
language modeling (next-token prediction) loss. Indices should be in
`[-1, 0, ..., config.vocab_size]`. Defaults to None.
loss_mask (flow.BoolTensor, optional): Mask to avoid performing loss computing
on ignored tokens. Tokens with indices set to `-1` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Defaults to None.
"""
outputs = self.roberta(input_ids, attention_mask, position_ids, tokentype_ids)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output, self.roberta.word_embeddings_weight())
if labels is not None:
# next-token prediction task, shift prediction_scores and labels by one.
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
shifted_prediction_scores = shifted_prediction_scores.to_global(
sbp=prediction_scores.sbp
)
shifted_labels = labels[:, 1:].contiguous()
shifted_labels = shifted_labels.to_global(sbp=labels.sbp)
lm_loss = self.loss_fc(shifted_prediction_scores, shifted_labels, loss_mask)
return {"lm_loss": lm_loss}
return {"prediction_scores": prediction_scores}
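# Worked example of the causal shift above (illustrative): with labels
# [t0, t1, t2, t3], prediction_scores[:, :-1] keeps the scores produced while
# reading t0..t2, and labels[:, 1:] = [t1, t2, t3], so position i is trained to
# predict token i + 1; the last position has no target and is dropped.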
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers import trunc_normal_
from flowvision.models import to_2tuple
from libai.config.config import configurable
from libai.layers import MLP, DropPath, LayerNorm, Linear
from libai.utils import distributed as dist
def window_partition(x, window_size):
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows
def window_reverse(windows, window_size, H, W):
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
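# Shape sketch (illustrative): window_partition and window_reverse are inverse
# reshapes. For example, with H = W = 56, window_size = 7 and C = 96:
#
#   x = flow.randn(2, 56, 56, 96)                        # (B, H, W, C)
#   windows = window_partition(x, 7)                     # (2 * 8 * 8, 7, 7, 96)
#   assert window_reverse(windows, 7, 56, 56).shape == x.shape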
class WindowAttention(nn.Module):
"""Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both shifted and non-shifted windows.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
"""
def __init__(
self,
dim,
window_size,
num_heads,
qkv_bias=True,
qk_scale=None,
attn_drop=0.0,
proj_drop=0.0,
fused_bias_add_dropout=False,
layer_idx=0,
):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim ** -0.5
# define a parameter table of relative position bias
self.relative_position_bias_table = nn.Parameter(
flow.zeros(
(2 * window_size[0] - 1) * (2 * window_size[1] - 1),
num_heads,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
) # 2*Wh-1 * 2*Ww-1, nH
trunc_normal_(self.relative_position_bias_table, std=0.02)
# get pair-wise relative position index for each token inside the window
coords_h = flow.arange(self.window_size[0])
coords_w = flow.arange(self.window_size[1])
coords = flow.stack(flow.meshgrid(*[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = flow.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] = (
relative_coords[:, :, 0] + self.window_size[0] - 1
) # shift to start from 0
relative_coords[:, :, 1] = relative_coords[:, :, 1] + self.window_size[1] - 1
relative_coords[:, :, 0] = relative_coords[:, :, 0] * (2 * self.window_size[1] - 1)
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer(
"relative_position_index",
relative_position_index.to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
self.qkv = Linear(dim, dim * 3, bias=qkv_bias, layer_idx=layer_idx)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = Linear(dim, dim, layer_idx=layer_idx)
self.proj_drop = nn.Dropout(proj_drop)
self.softmax = nn.Softmax(dim=-1)
self.fused_bias_add_dropout = fused_bias_add_dropout
self.p = proj_drop
def forward(self, x, mask):
"""
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv = (
self.qkv(x)
.reshape(B_, N, 3, self.num_heads, C // self.num_heads)
.permute(2, 0, 3, 1, 4)
)
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
# attn = flow.matmul(q, k.transpose(-2, -1))
attn = flow.matmul(q, k, transpose_b=True)
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.view(-1)
].view(
self.window_size[0] * self.window_size[1],
self.window_size[0] * self.window_size[1],
-1,
) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1
).contiguous() # nH, Wh*Ww, Wh*Ww
unsqueeze_relative_position_bias = relative_position_bias.unsqueeze(0)
attn = attn + unsqueeze_relative_position_bias
if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = flow.matmul(attn, v).transpose(1, 2).reshape(B_, N, C)
if self.fused_bias_add_dropout:
x = flow._C.matmul(x, self.proj.weight, transpose_a=False, transpose_b=True)
x = flow._C.fused_bias_add_dropout(x, self.proj.bias, p=self.p, axis=2)
else:
x = self.proj(x)
x = self.proj_drop(x)
return x
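# Indexing sketch (illustrative): for a 2x2 window the pair-wise offsets
# (dh, dw) lie in [-1, 1], so after shifting by (Wh - 1, Ww - 1) and scaling dh
# by (2 * Ww - 1), relative_position_index holds values in
# [0, (2 * Wh - 1) * (2 * Ww - 1)) = [0, 9). Indexing the bias table with these
# values yields one (Wh*Ww, Wh*Ww) bias map per head, which is added to the
# attention logits before the softmax.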
class SwinTransformerBlock(nn.Module):
"""Swin Transformer Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: libai.layers.LayerNorm
"""
def __init__(
self,
dim,
input_resolution,
num_heads,
window_size=7,
shift_size=0,
mlp_ratio=4.0,
qkv_bias=True,
qk_scale=None,
drop=0.0,
attn_drop=0.0,
drop_path=0.0,
act_layer=nn.GELU,
norm_layer=LayerNorm,
layer_idx=0,
):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
self.layer_idx = layer_idx
if min(self.input_resolution) <= self.window_size:
# if window size is larger than input resolution, we don't partition windows
self.shift_size = 0
self.window_size = min(self.input_resolution)
assert 0 <= self.shift_size < self.window_size, "shift_size must be in the range [0, window_size)"
self.norm1 = norm_layer(dim, layer_idx=layer_idx)
self.attn = WindowAttention(
dim,
window_size=to_2tuple(self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
fused_bias_add_dropout=True,
layer_idx=layer_idx,
)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = norm_layer(dim, layer_idx=layer_idx)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = MLP(
hidden_size=dim,
ffn_hidden_size=mlp_hidden_dim,
output_dropout_prob=drop,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
layer_idx=layer_idx,
)
if self.shift_size > 0:
# calculate attention mask for SW-MSA
H, W = self.input_resolution
img_mask = flow.zeros((1, H, W, 1)) # 1 H W 1
h_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
w_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt = cnt + 1
mask_windows = window_partition(
img_mask, self.window_size
) # nW, window_size, window_size, 1
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
attn_mask == 0, float(0.0)
)
attn_mask = attn_mask.to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
else:
attn_mask = None
self.register_buffer("attn_mask", attn_mask)
def forward(self, x):
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
shortcut = x
x = self.norm1(x)
x = x.view(B, H, W, C)
# cyclic shift
if self.shift_size > 0:
shifted_x = flow.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
else:
shifted_x = x
# partition windows
x_windows = window_partition(
shifted_x, self.window_size
) # nW*B, window_size, window_size, C
x_windows = x_windows.view(
-1, self.window_size * self.window_size, C
) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(x_windows, self.attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = flow.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
x = shifted_x
x = x.view(B, H * W, C)
# FFN
x = shortcut + self.drop_path(x)
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
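# Mask sketch (illustrative): for shifted blocks the precomputed attn_mask puts
# -100.0 on pairs of positions that come from different image regions after
# flow.roll, so their attention weights vanish under softmax, while pairs from
# the same region get 0.0 and attend normally. Non-shifted blocks
# (shift_size == 0) register attn_mask = None and skip the masking.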
class PatchMerging(nn.Module):
"""Patch Merging Layer.
Args:
input_resolution (tuple[int]): Resolution of input feature.
dim (int): Number of input channels.
norm_layer (nn.Module, optional): Normalization layer. Default: libai.layers.LayerNorm
"""
def __init__(self, input_resolution, dim, norm_layer=LayerNorm, layer_idx=0):
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.reduction = Linear(4 * dim, 2 * dim, bias=False, layer_idx=layer_idx)
self.norm = norm_layer(4 * dim, layer_idx=layer_idx)
self.layer_idx = layer_idx
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
x = x.view(B, H, W, C)
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = flow.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
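# Shape sketch (illustrative): PatchMerging halves the spatial resolution and
# doubles the channels. For a Swin-T stage-1 feature map of shape (B, 56*56, 96),
# the four strided views concatenate to (B, 28*28, 384), which the Linear then
# reduces to (B, 28*28, 192).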
class PatchEmbed(nn.Module):
"""Image to Patch Embedding
Args:
img_size (int): Image size. Default: 224.
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""
def __init__(
self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None, layer_idx=0
):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [
img_size[0] // patch_size[0],
img_size[1] // patch_size[1],
]
self.img_size = img_size
self.patch_size = patch_size
self.patches_resolution = patches_resolution
self.num_patches = patches_resolution[0] * patches_resolution[1]
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
).to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
if norm_layer is not None:
self.norm = norm_layer(embed_dim, layer_idx=layer_idx)
else:
self.norm = None
def forward(self, x):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
assert (
H == self.img_size[0] and W == self.img_size[1]
), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
if self.norm is not None:
x = self.norm(x)
return x
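# Shape sketch (illustrative): with the defaults img_size=224, patch_size=4 and
# embed_dim=96, the Conv2d projection maps (B, 3, 224, 224) to (B, 96, 56, 56),
# which is flattened and transposed to (B, 3136, 96) before the optional norm.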
class BasicLayer(nn.Module):
"""A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: libai.layers.LayerNorm
downsample (nn.Module | None, optional): Downsample at the end of the layer. Default: None
"""
def __init__(
self,
dim,
input_resolution,
depth,
num_heads,
window_size,
mlp_ratio=4.0,
qkv_bias=True,
qk_scale=None,
drop=0.0,
attn_drop=0.0,
drop_path=0.0,
norm_layer=LayerNorm,
downsample=None,
layer_id_offset=0,
):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.layer_id_offset = layer_id_offset
# build blocks
self.blocks = nn.ModuleList(
[
SwinTransformerBlock(
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop,
attn_drop=attn_drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer,
layer_idx=layer_id_offset + i,
)
for i in range(depth)
]
)
# patch merging layer
if downsample is not None:
self.downsample = downsample(
input_resolution,
dim=dim,
norm_layer=norm_layer,
layer_idx=layer_id_offset + depth - 1,
)
else:
self.downsample = None
def forward(self, x):
layer_idx = self.layer_id_offset
for i in range(len(self.blocks)):
x = x.to_global(placement=dist.get_layer_placement(layer_idx))
x = self.blocks[i](x)
layer_idx += 1
if self.downsample is not None:
x = self.downsample(x)
return x
class SwinTransformer(nn.Module):
"""Swin Transformer in LiBai.
LiBai implement of:
`Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
<https://arxiv.org/pdf/2103.14030>`_
Args:
img_size (int, tuple(int)): Input image size. Default 224
patch_size (int, tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin Transformer layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
drop_rate (float): Dropout rate. Default: 0
attn_drop_rate (float): Attention dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Module): Normalization layer. Default: libai.layers.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
loss_func (callable, optional): Loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=4,
in_chans=3,
num_classes=1000,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.0,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.1,
norm_layer=LayerNorm,
ape=False,
patch_norm=True,
loss_func=None,
**kwargs,
):
super().__init__()
self.num_classes = num_classes
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
self.mlp_ratio = mlp_ratio
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None,
layer_idx=0,
)
num_patches = self.patch_embed.num_patches
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(
flow.zeros(1, num_patches, embed_dim),
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
trunc_normal_(self.absolute_pos_embed, std=0.02)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = [
x.item() for x in flow.linspace(0, drop_path_rate, sum(depths))
] # stochastic depth decay rule
# build layers
self.layers = nn.ModuleList()
layer_id_offset = 0
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2 ** i_layer),
input_resolution=(
patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer),
),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
layer_id_offset=layer_id_offset,
)
layer_id_offset += depths[i_layer]
self.layers.append(layer)
self.norm = norm_layer(self.num_features, layer_idx=-1)
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.head = (
Linear(self.num_features, num_classes, layer_idx=-1)
if num_classes > 0
else nn.Identity()
)
# Loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if isinstance(m, Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"num_classes": cfg.num_classes,
"embed_dim": cfg.embed_dim,
"depths": cfg.depths,
"num_heads": cfg.num_heads,
"window_size": cfg.window_size,
"mlp_ratio": cfg.mlp_ratio,
"qkv_bias": cfg.qkv_bias,
"qk_scale": cfg.qk_scale,
"drop_rate": cfg.drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"ape": cfg.ape,
"patch_norm": cfg.patch_norm,
"loss_func": cfg.loss_func,
}
def forward_features(self, x):
x = self.patch_embed(x)
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
for layer in self.layers:
x = layer(x)
x = self.norm(x) # B L C
x = self.avgpool(x.transpose(1, 2)) # B C 1
x = flow.flatten(x, 1)
return x
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.patch_embed, "config"):
# Old API in OneFlow 0.8
model.patch_embed.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
for module_block in model.modules():
if isinstance(module_block.origin, SwinTransformerBlock):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, PatchMerging):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.avgpool.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
model.patch_embed.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), SwinTransformerBlock):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), PatchMerging):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
model.norm.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.avgpool.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
@staticmethod
def set_activation_checkpoint(model):
for module_block in model.modules():
if hasattr(module_block, "origin"):
# Old API in OneFlow 0.8
if isinstance(module_block.origin, SwinTransformerBlock):
module_block.config.activation_checkpointing = True
else:
if isinstance(module_block.to(nn.Module), SwinTransformerBlock):
module_block.to(flow.nn.graph.GraphModule).activation_checkpointing = True
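# Usage sketch (hedged): the constructor defaults correspond to a Swin-T sized
# model and can also be driven through the @configurable/LazyConfig path.
#
#   model = SwinTransformer(img_size=224, embed_dim=96, depths=[2, 2, 6, 2],
#                           num_heads=[3, 6, 12, 24], window_size=7)
#   out = model(images)           # eval: {"prediction_scores": logits}
#   out = model(images, labels)   # train: {"losses": loss_value}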
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import oneflow as flow
import oneflow.nn as nn
import oneflow.nn.functional as F
from flowvision.layers import trunc_normal_
from flowvision.models import to_2tuple
from libai.config.config import configurable
from libai.layers import MLP, DropPath, LayerNorm, Linear
from libai.utils import distributed as dist
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
class WindowAttention(nn.Module):
r"""Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both shifted and non-shifted windows.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
Default: True
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
pretrained_window_size (tuple[int]): The height and width of the window in pre-training.
"""
def __init__(
self,
dim,
window_size,
num_heads,
qkv_bias=True,
attn_drop=0.0,
proj_drop=0.0,
pretrained_window_size=[0, 0],
fused_bias_add_dropout=False,
layer_idx=0,
):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.pretrained_window_size = pretrained_window_size
self.fused_bias_add_dropout = fused_bias_add_dropout
self.num_heads = num_heads
self.layer_idx = layer_idx
self.p = proj_drop
self.logit_scale = nn.Parameter(
flow.log(
10
* flow.ones(
1,
num_heads,
1,
1,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
),
requires_grad=True,
)
# NOTE: meta network: an MLP that generates the continuous relative position bias
self.cpb_mlp = nn.Sequential(
Linear(2, 512, bias=True, layer_idx=layer_idx),
nn.ReLU(inplace=True),
Linear(512, num_heads, bias=False, layer_idx=layer_idx),
).to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
# NOTE: get relative_coords_table
relative_coords_h = flow.arange(
-(self.window_size[0] - 1), self.window_size[0], dtype=flow.float32
)
relative_coords_w = flow.arange(
-(self.window_size[1] - 1), self.window_size[1], dtype=flow.float32
)
relative_coords_table = (
flow.stack(flow.meshgrid(*[relative_coords_h, relative_coords_w]))
.permute(1, 2, 0)
.contiguous()
.unsqueeze(0)
) # 1, 2*Wh-1, 2*Ww-1, 2
# NOTE: For any relative coordinate, constrain it to -8~8 (window size)
if pretrained_window_size[0] > 0:
relative_coords_table[:, :, :, 0] = relative_coords_table[:, :, :, 0] / (
pretrained_window_size[0] - 1
)
relative_coords_table[:, :, :, 1] = relative_coords_table[:, :, :, 1] / (
pretrained_window_size[1] - 1
)
else:
relative_coords_table[:, :, :, 0] = relative_coords_table[:, :, :, 0] / (
self.window_size[0] - 1
)
relative_coords_table[:, :, :, 1] = relative_coords_table[:, :, :, 1] / (
self.window_size[1] - 1
)
relative_coords_table = relative_coords_table * 8
# NOTE: y = sign(x) * log2(|x| + 1) / log2(8), a signed log that compresses the [-8, 8] range to roughly [-1, 1]
relative_coords_table = (
flow.sign(relative_coords_table)
* flow.log2(flow.abs(relative_coords_table) + 1.0)
/ math.log2(8.0)
)
self.register_buffer(
"relative_coords_table",
relative_coords_table.to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
# NOTE: get pair-wise relative position index for each token inside the window
coords_h = flow.arange(self.window_size[0])
coords_w = flow.arange(self.window_size[1])
coords = flow.stack(flow.meshgrid(*[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = flow.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] = (
relative_coords[:, :, 0] + self.window_size[0] - 1
) # shift to start from 0
relative_coords[:, :, 1] = relative_coords[:, :, 1] + self.window_size[1] - 1
relative_coords[:, :, 0] = relative_coords[:, :, 0] * (2 * self.window_size[1] - 1)
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer(
"relative_position_index",
relative_position_index.to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
)
self.qkv = Linear(dim, dim * 3, bias=False, layer_idx=layer_idx)
if qkv_bias:
self.q_bias = nn.Parameter(
flow.zeros(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.v_bias = nn.Parameter(
flow.zeros(
dim,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
else:
self.q_bias = None
self.v_bias = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = Linear(dim, dim, layer_idx=layer_idx)
self.proj_drop = nn.Dropout(proj_drop)
self.softmax = nn.Softmax(dim=-1)
def forward(self, x, mask=None):
"""
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv_bias = None
if self.q_bias is not None:
qkv_bias = flow.concat(
[
self.q_bias,
flow.zeros(
self.v_bias.shape,
requires_grad=False,
placement=dist.get_layer_placement(
self.layer_idx, device_type=self.v_bias.placement.type
),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
),
self.v_bias,
],
dim=0,
)
qkv = self.qkv(x) + qkv_bias.unsqueeze(0).unsqueeze(0)
qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]
# NOTE: cosine attention
attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
# NOTE: a learnable scalar
logit_scale = flow.clamp(self.logit_scale, min=-1e6, max=math.log(1.0 / 0.01)).exp()
attn = attn * logit_scale
# NOTE: use relative_coords_table and meta network to generate relative_position_bias
relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(
-1, self.num_heads
)
relative_position_bias = relative_position_bias_table[
self.relative_position_index.view(-1)
].view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1
).contiguous() # nH, Wh*Ww, Wh*Ww
# NOTE: constrained to a range of -16~16
relative_position_bias = 16 * flow.sigmoid(relative_position_bias).unsqueeze(0)
attn = attn + relative_position_bias
if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
if self.fused_bias_add_dropout:
x = flow._C.matmul(x, self.proj.weight, transpose_a=False, transpose_b=True)
x = flow._C.fused_bias_add_dropout(x, self.proj.bias, p=self.p, axis=2)
else:
x = self.proj(x)
x = self.proj_drop(x)
return x
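# Cosine-attention sketch (illustrative): instead of the V1 dot product scaled
# by head_dim ** -0.5, V2 computes cos(q, k) = <q/|q|, k/|k|> and multiplies it
# by a learned per-head temperature exp(logit_scale), clamped so it never
# exceeds 1 / 0.01 = 100. The positional bias added to the logits comes from
# the cpb_mlp meta network instead of a directly learned table.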
class SwinTransformerBlock(nn.Module):
r"""Swin Transformer Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: libai.layers.LayerNorm
pretrained_window_size (int): Window size in pre-training.
"""
def __init__(
self,
dim,
input_resolution,
num_heads,
window_size=7,
shift_size=0,
mlp_ratio=4.0,
qkv_bias=True,
drop=0.0,
attn_drop=0.0,
drop_path=0.0,
act_layer=nn.GELU,
norm_layer=LayerNorm,
pretrained_window_size=0,
layer_idx=0,
):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
self.layer_idx = layer_idx
if min(self.input_resolution) <= self.window_size:
# if window size is larger than input resolution, we don't partition windows
self.shift_size = 0
self.window_size = min(self.input_resolution)
assert 0 <= self.shift_size < self.window_size, "shift_size must be in the range [0, window_size)"
self.norm1 = norm_layer(dim, layer_idx=layer_idx)
self.attn = WindowAttention(
dim,
window_size=to_2tuple(self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_drop=attn_drop,
proj_drop=drop,
pretrained_window_size=to_2tuple(pretrained_window_size),
fused_bias_add_dropout=True,
layer_idx=layer_idx,
)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = norm_layer(dim, layer_idx=layer_idx)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = MLP(
hidden_size=dim,
ffn_hidden_size=mlp_hidden_dim,
output_dropout_prob=drop,
bias_gelu_fusion=True,
bias_dropout_fusion=True,
layer_idx=layer_idx,
)
if self.shift_size > 0:
# calculate attention mask for SW-MSA
H, W = self.input_resolution
img_mask = flow.zeros((1, H, W, 1)) # 1 H W 1
h_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
w_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt = cnt + 1
mask_windows = window_partition(
img_mask, self.window_size
) # nW, window_size, window_size, 1
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = (
attn_mask.masked_fill(attn_mask != 0, float(-100.0))
.masked_fill(attn_mask == 0, float(0.0))
.to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
else:
attn_mask = None
self.register_buffer("attn_mask", attn_mask)
def forward(self, x):
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
shortcut = x
x = x.view(B, H, W, C)
# cyclic shift
if self.shift_size > 0:
shifted_x = flow.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
else:
shifted_x = x
# partition windows
x_windows = window_partition(
shifted_x, self.window_size
) # nW*B, window_size, window_size, C
x_windows = x_windows.view(
-1, self.window_size * self.window_size, C
) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = flow.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
x = shifted_x
x = x.view(B, H * W, C)
# NOTE: res-post-norm
x = shortcut + self.drop_path(self.norm1(x))
# NOTE: res-post-norm
x = x + self.drop_path(self.norm2(self.mlp(x)))
return x
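# Norm-placement sketch (illustrative): the V1 block uses pre-norm,
#   x = x + DropPath(Attn(LN(x)));  x = x + DropPath(MLP(LN(x)))
# while the V2 block above uses res-post-norm,
#   x = x + DropPath(LN(Attn(x)));  x = x + DropPath(LN(MLP(x)))
# with norm1/norm2 zero-initialized in _init_respostnorm, so each block
# initially contributes nothing on top of its residual path.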
class PatchMerging(nn.Module):
"""Patch Merging Layer.
Args:
input_resolution (tuple[int]): Resolution of input feature.
dim (int): Number of input channels.
norm_layer (nn.Module, optional): Normalization layer. Default: libai.layers.LayerNorm
"""
def __init__(self, input_resolution, dim, norm_layer=LayerNorm, layer_idx=0):
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.reduction = Linear(4 * dim, 2 * dim, bias=False, layer_idx=layer_idx)
# NOTE: swinv2-> 2*dim, swin-> 4*dim
self.norm = norm_layer(2 * dim, layer_idx=layer_idx)
self.layer_idx = layer_idx
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
x = x.view(B, H, W, C)
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = flow.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
# NOTE: res-post-norm: Swin-V2 applies the reduction before the norm, whereas Swin-V1 applies the norm before the reduction
x = self.reduction(x)
x = self.norm(x)
return x
class BasicLayer(nn.Module):
"""A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: libai.layers.LayerNorm
downsample (nn.Module | None, optional): Downsample layer at the end of the layer.
Default: None
pretrained_window_size (int): Local window size in pre-training.
"""
def __init__(
self,
dim,
input_resolution,
depth,
num_heads,
window_size,
mlp_ratio=4.0,
qkv_bias=True,
drop=0.0,
attn_drop=0.0,
drop_path=0.0,
norm_layer=LayerNorm,
downsample=None,
pretrained_window_size=0,
layer_id_offset=0,
):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.layer_id_offset = layer_id_offset
# build blocks
self.blocks = nn.ModuleList(
[
SwinTransformerBlock(
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop=drop,
attn_drop=attn_drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer,
pretrained_window_size=pretrained_window_size,
layer_idx=layer_id_offset + i,
)
for i in range(depth)
]
)
# patch merging layer
if downsample is not None:
self.downsample = downsample(
input_resolution,
dim=dim,
norm_layer=norm_layer,
layer_idx=layer_id_offset + depth - 1,
)
else:
self.downsample = None
def forward(self, x):
layer_idx = self.layer_id_offset
for blk in self.blocks:
x = x.to_global(
placement=dist.get_layer_placement(layer_idx, device_type=x.placement.type)
)
x = blk(x)
layer_idx += 1
if self.downsample is not None:
x = self.downsample(x)
return x
def _init_respostnorm(self):
for blk in self.blocks:
nn.init.constant_(blk.norm1.bias, 0)
nn.init.constant_(blk.norm1.weight, 0)
nn.init.constant_(blk.norm2.bias, 0)
nn.init.constant_(blk.norm2.weight, 0)
class PatchEmbed(nn.Module):
r"""Image to Patch Embedding
Args:
img_size (int): Image size. Default: 224.
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""
def __init__(
self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None, layer_idx=0
):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
self.img_size = img_size
self.patch_size = patch_size
self.patches_resolution = patches_resolution
self.num_patches = patches_resolution[0] * patches_resolution[1]
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
).to_global(
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
assert (
H == self.img_size[0] and W == self.img_size[1]
), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
if self.norm is not None:
x = self.norm(x)
return x
class SwinTransformerV2(nn.Module):
r"""Swin Transformer V2 in LiBai.
Args:
img_size (int | tuple(int)): Input image size. Default 224
patch_size (int | tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin Transformer layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
drop_rate (float): Dropout rate. Default: 0
attn_drop_rate (float): Attention dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Module): Normalization layer. Default: libai.layers.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer.
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=4,
in_chans=3,
num_classes=1000,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.0,
qkv_bias=True,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.1,
norm_layer=LayerNorm,
ape=False,
patch_norm=True,
pretrained_window_sizes=[0, 0, 0, 0],
loss_func=None,
):
super().__init__()
self.num_classes = num_classes
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
self.mlp_ratio = mlp_ratio
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None,
layer_idx=0,
)
num_patches = self.patch_embed.num_patches
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(
flow.zeros(
1,
num_patches,
embed_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
trunc_normal_(self.absolute_pos_embed, std=0.02)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = [
x.item() for x in flow.linspace(0, drop_path_rate, sum(depths))
] # stochastic depth decay rule
# build layers
self.layers = nn.ModuleList()
layer_id_offset = 0
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2 ** i_layer),
input_resolution=(
patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer),
),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
pretrained_window_size=pretrained_window_sizes[i_layer],
layer_id_offset=layer_id_offset,
)
layer_id_offset += depths[i_layer]
self.layers.append(layer)
self.norm = norm_layer(self.num_features, layer_idx=-1)
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.head = (
Linear(self.num_features, num_classes, layer_idx=-1)
if num_classes > 0
else nn.Identity()
)
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
self.apply(self._init_weights)
for bly in self.layers:
bly._init_respostnorm()
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if isinstance(m, Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"num_classes": cfg.num_classes,
"embed_dim": cfg.embed_dim,
"depths": cfg.depths,
"num_heads": cfg.num_heads,
"window_size": cfg.window_size,
"mlp_ratio": cfg.mlp_ratio,
"qkv_bias": cfg.qkv_bias,
"drop_rate": cfg.drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"ape": cfg.ape,
"patch_norm": cfg.patch_norm,
"pretrained_window_sizes": cfg.pretrained_window_sizes,
"loss_func": cfg.loss_func,
}
def forward_features(self, x):
x = self.patch_embed(x)
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
for layer in self.layers:
x = layer(x)
x = self.norm(x) # B L C
x = self.avgpool(x.transpose(1, 2)) # B C 1
x = flow.flatten(x, 1)
return x
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.patch_embed, "config"):
# Old API in OneFlow 0.8
model.patch_embed.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
for module_block in model.modules():
if isinstance(module_block.origin, SwinTransformerBlock):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, PatchMerging):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.avgpool.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
model.patch_embed.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), SwinTransformerBlock):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), PatchMerging):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
model.norm.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.avgpool.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
@staticmethod
def set_activation_checkpoint(model):
for module_block in model.modules():
if hasattr(module_block, "origin"):
# Old API in OneFlow 0.8
if isinstance(module_block.origin, SwinTransformerBlock):
module_block.config.activation_checkpointing = True
else:
if isinstance(module_block.to(nn.Module), SwinTransformerBlock):
module_block.to(flow.nn.graph.GraphModule).activation_checkpointing = True
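# Usage sketch (hedged): SwinTransformerV2 is constructed like the V1 model,
# with an extra `pretrained_window_sizes` entry per stage which, as far as this
# code shows, only rescales the continuous relative-position coordinates when a
# non-zero pre-training window size is given.
#
#   model = SwinTransformerV2(window_size=8, pretrained_window_sizes=[0, 0, 0, 0])
#   out = model(images)  # eval: {"prediction_scores": logits}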
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
from libai.config import configurable
from libai.layers import (
Embedding,
LayerNorm,
LMLogits,
ParallelCrossEntropyLoss,
TransformerLayer,
VocabEmbedding,
)
from libai.layers.attention import AttnMaskType
from libai.models.utils import init_method_normal, scaled_init_method_normal
from libai.utils import distributed as dist
class ExtendedMask(flow.nn.Module):
def forward(self, attention_mask):
return attention_mask.unsqueeze(1)
class T5Embedding(flow.nn.Module):
def __init__(
self,
hidden_size,
vocab_size,
max_sequence_length,
embedding_dropout_prob,
init_method=flow.nn.init.xavier_normal_,
amp_enabled=False,
) -> None:
super().__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.word_embeddings = VocabEmbedding(
num_embeddings=vocab_size,
embedding_dim=hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
)
self.position_embeddings = Embedding(
num_embeddings=max_sequence_length,
embedding_dim=hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
)
self.position_ids = flow.arange(
max_sequence_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
).unsqueeze(0)
self.embedding_dropout = flow.nn.Dropout(embedding_dropout_prob)
def forward(self, input_ids, past_length=0):
seq_length = input_ids.size()[1]
position_ids = self.position_ids[:, past_length : past_length + seq_length]
position_ids = position_ids.expand_as(input_ids).to_global(sbp=input_ids.sbp)
word_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = word_embeddings + position_embeddings
embeddings = self.embedding_dropout(embeddings)
return embeddings
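# Slicing sketch (illustrative): position_ids is a fixed [1, max_sequence_length]
# arange, so for an input of length S with past_length = P the slice
# position_ids[:, P : P + S] yields positions [P, ..., P + S - 1], letting the
# same embedding serve incremental decoding where only new tokens are fed in.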
class T5Model(flow.nn.Module):
"""T5 Model that outputs logits.
Args:
vocab_size (int): The size of vocabulary file.
hidden_size (int): The size of hidden states.
hidden_layers (int): The number of ``TransformerLayer`` in the encoder and decoder.
num_attention_heads (int):
The number of attention heads for each attention layer of ``TransformerLayer``.
intermediate_size (int):
The size of intermediate layer in feed-forward network for each ``TransformerLayer``.
embedding_dropout_prob (float): The dropout ratio for the output of T5Embedding Layer.
hidden_dropout_prob (float): The dropout ratio for the output for each ``TransformerLayer``.
attention_probs_dropout_prob (float):
The dropout ratio for the output of each attention layer in ``TransformerLayer``.
max_position_embeddings (int):
Max sequence length of input, defines the shape of Position Embeddings
in ``T5Embedding``.
initializer_range (float, optional):
Sigma of the normal distribution in the initialization method. Defaults to 0.02.
layernorm_eps (float, optional): The epsilon of LayerNorm layer. Defaults to 1e-12.
bias_gelu_fusion (bool, optional):
Whether or not to fuse the computing of bias and gelu. Defaults to ``False``.
bias_dropout_fusion (bool, optional):
Whether or not to fuse the computing of dropout and bias. Defaults to ``False``.
scale_mask_softmax_fusion (bool, optional):
Whether to fuse the computing of mask and softmax in attention layers.
Defaults to ``False``.
apply_query_key_layer_scaling (bool, optional):
Whether or not to use layer index related scaling in computing attention scores.
If ``True``, the scaling factor equals sqrt(d) * (layer_index + 1).
Defaults to ``True``.
apply_residual_post_layernorm (bool, optional):
If set to ``True``, use the original BERT residual connection ordering;
otherwise use the Megatron-BERT residual connection, which is more stable
when scaling up model size, as introduced in https://arxiv.org/pdf/1909.08053.pdf.
Default: ``False``.
amp_enabled (bool, optional):
Whether or not to set fp16 for embedding weight in T5 model. Defaults to ``False``.
"""
@configurable
def __init__(
self,
vocab_size,
hidden_size,
hidden_layers,
num_attention_heads,
intermediate_size,
embedding_dropout_prob,
hidden_dropout_prob,
attention_probs_dropout_prob,
max_position_embeddings,
initializer_range=0.02,
layernorm_eps=1e-12,
bias_gelu_fusion=False,
bias_dropout_fusion=False,
scale_mask_softmax_fusion=False,
apply_query_key_layer_scaling=True,
apply_residual_post_layernorm=False,
amp_enabled=False,
) -> None:
super().__init__()
init_method = init_method_normal(initializer_range)
scaled_init_method = scaled_init_method_normal(initializer_range, hidden_layers)
self.embedding = T5Embedding(
hidden_size=hidden_size,
vocab_size=vocab_size,
max_sequence_length=max_position_embeddings,
embedding_dropout_prob=embedding_dropout_prob,
init_method=init_method,
amp_enabled=amp_enabled,
)
self.extended_attn_mask = ExtendedMask()
encoder_layers = flow.nn.ModuleList(
[
TransformerLayer(
hidden_size=hidden_size,
ffn_hidden_size=intermediate_size,
num_attention_heads=num_attention_heads,
is_decoder=False,
attention_dropout_prob=attention_probs_dropout_prob,
output_dropout_prob=hidden_dropout_prob,
layernorm_epsilon=layernorm_eps,
init_method=init_method,
output_layer_init_method=scaled_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
apply_residual_post_layernorm=apply_residual_post_layernorm,
attn_mask_type=AttnMaskType.padding,
layer_idx=i,
)
for i in range(hidden_layers)
]
)
encoder_final_layernorm = LayerNorm(
(hidden_size,),
eps=layernorm_eps,
layer_idx=hidden_layers - 1,
)
self.encoder = flow.nn.Sequential()
self.encoder.add_module("layers", encoder_layers)
self.encoder.add_module("final_layernorm", encoder_final_layernorm)
decoder_layers = flow.nn.ModuleList(
[
TransformerLayer(
hidden_size=hidden_size,
ffn_hidden_size=intermediate_size,
num_attention_heads=num_attention_heads,
is_decoder=True,
attention_dropout_prob=attention_probs_dropout_prob,
output_dropout_prob=hidden_dropout_prob,
layernorm_epsilon=layernorm_eps,
init_method=init_method,
output_layer_init_method=scaled_init_method,
bias_gelu_fusion=bias_gelu_fusion,
bias_dropout_fusion=bias_dropout_fusion,
scale_mask_softmax_fusion=scale_mask_softmax_fusion,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
attn_mask_type=AttnMaskType.padding,
layer_idx=i,
)
for i in range(hidden_layers, 2 * hidden_layers)
]
)
decoder_final_layernorm = LayerNorm(
(hidden_size,),
eps=layernorm_eps,
layer_idx=2 * hidden_layers - 1,
)
self.decoder = flow.nn.Sequential()
self.decoder.add_module("layers", decoder_layers)
self.decoder.add_module("final_layernorm", decoder_final_layernorm)
self.past_key_values = [None] * len(self.decoder.layers)
self.encoder_states = None
self.past_length = 0
self.lm_head = LMLogits(vocab_size, bias=True)
@classmethod
def from_config(cls, cfg):
return {
"vocab_size": cfg.vocab_size,
"hidden_size": cfg.hidden_size,
"hidden_layers": cfg.hidden_layers,
"num_attention_heads": cfg.num_attention_heads,
"intermediate_size": cfg.intermediate_size,
"embedding_dropout_prob": cfg.embedding_dropout_prob,
"hidden_dropout_prob": cfg.hidden_dropout_prob,
"attention_probs_dropout_prob": cfg.attention_probs_dropout_prob,
"max_position_embeddings": cfg.max_position_embeddings,
"initializer_range": cfg.initializer_range,
"layernorm_eps": cfg.layernorm_eps,
"bias_gelu_fusion": cfg.bias_gelu_fusion,
"bias_dropout_fusion": cfg.bias_dropout_fusion,
"scale_mask_softmax_fusion": cfg.scale_mask_softmax_fusion,
"apply_query_key_layer_scaling": cfg.apply_query_key_layer_scaling,
"apply_residual_post_layernorm": cfg.apply_residual_post_layernorm,
"amp_enabled": cfg.amp_enabled,
}
def forward(
self,
encoder_input_ids,
decoder_input_ids,
encoder_attn_mask,
decoder_attn_mask,
encoder_decoder_attn_mask,
use_cache=False,
):
"""
Args:
encoder_input_ids (flow.LongTensor):
Indices of input sequence tokens in vocabulary for encoder.
decoder_input_ids (flow.LongTensor):
Indices of input sequence tokens in vocabulary for decoder.
encoder_attn_mask (flow.BoolTensor):
Mask for encoder to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
decoder_attn_mask (flow.BoolTensor):
Mask for decoder to avoid performing attention on subsequent token indices.
Mask values have the same meaning as encoder_attn_mask.
encoder_decoder_attn_mask (flow.BoolTensor):
Mask for decoder to avoid performing attention on encoder padded token indices.
Mask values have the same meaning as encoder_attn_mask.
use_cache (bool, optional):
Set to ``True`` when the model is in the inference
phase and uses incremental decoding. Defaults to ``False``.
Returns:
flow.Tensor: logits
"""
encoder_input_ids = encoder_input_ids.to_global(placement=dist.get_layer_placement(0))
decoder_input_ids = decoder_input_ids.to_global(placement=dist.get_layer_placement(0))
encoder_attn_mask = encoder_attn_mask.to_global(placement=dist.get_layer_placement(0))
decoder_attn_mask = decoder_attn_mask.to_global(placement=dist.get_layer_placement(0))
encoder_decoder_attn_mask = encoder_decoder_attn_mask.to_global(
placement=dist.get_layer_placement(0)
)
if use_cache and self.encoder_states is not None:
encoder_states = self.encoder_states
else:
self.set_cache(encoder_states=None, past_key_values=None)
encoder_attn_mask = self.extended_attn_mask(encoder_attn_mask)
enc_embedding_output = self.embedding(encoder_input_ids)
enc_hidden_states = enc_embedding_output
for layer in self.encoder.layers:
enc_hidden_states = layer(enc_hidden_states, encoder_attn_mask)
encoder_states = self.encoder.final_layernorm(enc_hidden_states)
decoder_attn_mask = self.extended_attn_mask(decoder_attn_mask)
encoder_decoder_attn_mask = self.extended_attn_mask(encoder_decoder_attn_mask)
dec_embedding_output = self.embedding(decoder_input_ids, self.past_length)
dec_hidden_states = dec_embedding_output
if use_cache:
presents = []
for layer, past_key_value in zip(self.decoder.layers, self.past_key_values):
dec_hidden_states = layer(
dec_hidden_states,
decoder_attn_mask,
encoder_states,
encoder_decoder_attn_mask,
past_key_value=past_key_value,
use_cache=use_cache,
)
if use_cache:
dec_hidden_states, present = dec_hidden_states
presents.append(present)
if use_cache:
self.set_cache(encoder_states, past_key_values=presents)
decoder_states = self.decoder.final_layernorm(dec_hidden_states)
logits = self.lm_head(decoder_states, self.embedding.word_embeddings.weight)
return logits
def set_cache(self, encoder_states, past_key_values):
self.encoder_states = encoder_states
self.past_length = 0 if past_key_values is None else past_key_values[0][0].shape[2]
if past_key_values is None:
past_key_values = [None] * len(self.decoder.layers)
assert len(past_key_values) == len(self.decoder.layers), (
f"past_key_values's length {len(past_key_values)} doesn't match "
f"decoder num_layers' length {self.decoder.layers}"
)
self.past_key_values = past_key_values
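# A minimal usage sketch for ``T5Model`` (hypothetical hyper-parameters; it
# assumes LiBai's distributed environment has already been set up so that
# global tensors can be created). It only illustrates the constructor and the
# ``forward`` signature defined above, not a recommended configuration:
#
#     t5 = T5Model(
#         vocab_size=32128,
#         hidden_size=512,
#         hidden_layers=6,
#         num_attention_heads=8,
#         intermediate_size=2048,
#         embedding_dropout_prob=0.1,
#         hidden_dropout_prob=0.1,
#         attention_probs_dropout_prob=0.1,
#         max_position_embeddings=512,
#     )
#     logits = t5(
#         encoder_input_ids, decoder_input_ids,
#         encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask,
#     )  # roughly [batch_size, decoder_seq_length, vocab_size]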
class T5Loss(flow.nn.Module):
def __init__(self) -> None:
super().__init__()
self.lm_loss = ParallelCrossEntropyLoss()
def forward(self, logits, lm_labels, loss_mask):
lm_loss = self.lm_loss(logits, lm_labels)
loss_mask = loss_mask.to_global(placement=lm_loss.placement)
loss_mask = loss_mask.float()
denominator = loss_mask.sum().to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])
)
lm_loss = flow._C.amp_white_identity(lm_loss)
lm_loss = flow._C.amp_black_identity(lm_loss)
masked_lm_loss = flow.sum(lm_loss.view(-1) * loss_mask.view(-1)) / denominator
masked_lm_loss = masked_lm_loss.to_global(
sbp=dist.get_nd_sbp([flow.sbp.partial_sum, flow.sbp.broadcast])
)
return {"masked_lm_loss": masked_lm_loss}
class T5ForPreTraining(flow.nn.Module):
"""
T5 Model with a language modeling head on top for pretraining.
"""
def __init__(self, cfg) -> None:
super().__init__()
self.t5_model = T5Model(cfg)
self.loss_func = T5Loss()
def set_cache(self, encoder_states, past_key_values):
self.t5_model.set_cache(encoder_states, past_key_values)
def forward(
self,
encoder_input_ids,
decoder_input_ids,
encoder_attn_mask,
decoder_attn_mask,
encoder_decoder_attn_mask,
lm_labels=None,
loss_mask=None,
use_cache=False,
):
"""
Args:
encoder_input_ids (flow.LongTensor):
Indices of input sequence tokens in vocabulary for encoder.
decoder_input_ids (flow.LongTensor):
Indices of input sequence tokens in vocabulary for decoder.
encoder_attn_mask (flow.BoolTensor):
Mask for encoder to avoid performing attention on
padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
decoder_attn_mask (flow.BoolTensor):
Mask for decoder to avoid performing attention on subsequent token indices.
Mask values have the same meaning as encoder_attn_mask.
encoder_decoder_attn_mask (flow.BoolTensor):
Mask for decoder to avoid performing attention on encoder padded token indices.
Mask values have the same meaning as encoder_attn_mask.
lm_labels (flow.LongTensor, optional): Labels for computing the masked
language modeling loss. Indices should be in `[-1, 0, ..., config.vocab_size]`.
``None`` for evaluation.
loss_mask (flow.BoolTensor, optional):
Mask to avoid computing the loss on ignored tokens.
Tokens with indices set to `-1` are ignored (masked); the loss is only computed
for tokens with labels in `[0, ..., config.vocab_size]`.
``None`` for evaluation.
use_cache (bool, optional):
Set to ``True`` when the model is in the inference
phase and uses incremental decoding. Defaults to ``False``.
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"masked_lm_loss": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
logits = self.t5_model(
encoder_input_ids,
decoder_input_ids,
encoder_attn_mask,
decoder_attn_mask,
encoder_decoder_attn_mask,
use_cache=use_cache,
)
if lm_labels is not None:
lm_loss = self.loss_func(logits, lm_labels, loss_mask)
return lm_loss
else:
return {
"prediction_scores": logits,
}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.t5_model.encoder.final_layernorm, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, T5Embedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, ExtendedMask):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.origin, LMLogits):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.origin, T5Loss):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.t5_model.encoder.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(model.t5_model.encoder.final_layernorm.layer_idx),
dist.get_layer_placement(model.t5_model.encoder.final_layernorm.layer_idx),
)
model.t5_model.decoder.final_layernorm.config.set_stage(
dist_utils.get_layer_stage_id(model.t5_model.decoder.final_layernorm.layer_idx),
dist.get_layer_placement(model.t5_model.decoder.final_layernorm.layer_idx),
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), T5Embedding):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), ExtendedMask):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
elif isinstance(module_block.to(nn.Module), LMLogits):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
elif isinstance(module_block.to(nn.Module), T5Loss):
module_block.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.t5_model.encoder.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(model.t5_model.encoder.final_layernorm.layer_idx),
dist.get_layer_placement(model.t5_model.encoder.final_layernorm.layer_idx),
)
model.t5_model.decoder.final_layernorm.to(nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(model.t5_model.decoder.final_layernorm.layer_idx),
dist.get_layer_placement(model.t5_model.decoder.final_layernorm.layer_idx),
)
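# A rough incremental-decoding sketch for ``T5ForPreTraining`` (hypothetical
# variable names; the real generation loop lives outside this file). The first
# call with ``use_cache=True`` runs the encoder once and caches decoder
# key/value states via ``set_cache``; later calls only feed the newly generated
# decoder tokens, and ``set_cache(None, None)`` resets the cache for a new batch:
#
#     model.set_cache(encoder_states=None, past_key_values=None)    # fresh batch
#     out = model(enc_ids, dec_ids, enc_mask, dec_mask, enc_dec_mask,
#                 use_cache=True)                                    # primes cache
#     out = model(enc_ids, next_token_ids, enc_mask, dec_mask, enc_dec_mask,
#                 use_cache=True)                                    # reuses cache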
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .graph_base import GraphBase
from .weight_init import init_method_normal, scaled_init_method_normal
from .model_utils.base_loader import ModelLoaderHuggerFace, ModelLoaderLiBai
from .model_utils.bert_loader import BertLoaderHuggerFace, BertLoaderLiBai
from .model_utils.roberta_loader import RobertaLoaderHuggerFace, RobertaLoaderLiBai
from .model_utils.gpt_loader import GPT2LoaderHuggerFace, GPT2LoaderLiBai
from .model_utils.swin_loader import SwinLoaderHuggerFace, SwinLoaderLiBai
from .model_utils.swinv2_loader import SwinV2LoaderHuggerFace, SwinV2LoaderLiBai
from .model_utils.vit_loader import ViTLoaderHuggerFace, ViTLoaderLiBai