Commit 5988d2cc authored by yuguo960516

bert-large

parent 478602ba
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import oneflow as flow
from .base_loader import ModelLoaderHuggerFace, ModelLoaderLiBai
class ViTLoaderHuggerFace(ModelLoaderHuggerFace):
def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
"""NOTE: base_model_prefix_1 is ViT's prefix in Transformers.
base_model_prefix_2 is ViT's prefix in LiBai."""
self.base_model_prefix_1 = "vit"
self.base_model_prefix_2 = ""
def _convert_state_dict(self, flow_state_dict, cfg=None):
"""Convert state_dict's keys to match model.
Args:
flow_state_dict (OrderedDict): model state dict.
cfg (dict): model's default config dict.
Returns:
OrderedDict: flow state dict.
"""
# The converted checkpoint.
oneflow_state_dict = flow_state_dict.copy()
# Get configs
num_heads = cfg.get("num_heads")
hidden_size = cfg.get("embed_dim")
        head_size = hidden_size // num_heads
# prefix
has_prefix = any(s.startswith(self.base_model_prefix_1) for s in oneflow_state_dict)
index_idx = 3 if has_prefix else 2
old_keys = oneflow_state_dict.keys()
for key in list(old_keys):
# Convert vit's embedding layers
if "embeddings" in key:
if "cls_token" in key:
new_key = "cls_token"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "position_embeddings" in key:
new_key = "pos_embed"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "patch_embeddings.projection" in key:
if "weight" in key:
new_key = "patch_embed.proj.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "patch_embed.proj.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
# Convert vit's layernorm layers
elif "layernorm_before" in key:
index_block = key.split(".")[index_idx]
if "weight" in key:
new_key = "blocks." + index_block + ".input_layernorm.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "blocks." + index_block + ".input_layernorm.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "layernorm_after" in key:
index_block = key.split(".")[index_idx]
if "weight" in key:
new_key = "blocks." + index_block + ".post_attention_layernorm.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "blocks." + index_block + ".post_attention_layernorm.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
# Convert vit's attention layers
elif "attention" in key:
index_block = key.split(".")[index_idx]
if "attention.attention" in key:
if (
"blocks." + index_block + ".self_attention.query_key_value.weight"
in oneflow_state_dict.keys()
):
continue
q_w = key
k_w = q_w.replace("query", "key")
v_w = q_w.replace("query", "value")
q_b = q_w.replace("weight", "bias")
k_b = k_w.replace("weight", "bias")
v_b = v_w.replace("weight", "bias")
qkv_w = flow.cat(
(
oneflow_state_dict.pop(q_w),
oneflow_state_dict.pop(k_w),
oneflow_state_dict.pop(v_w),
),
dim=0,
)
qkv_b = flow.cat(
(
oneflow_state_dict.pop(q_b),
oneflow_state_dict.pop(k_b),
oneflow_state_dict.pop(v_b),
),
dim=-1,
)
qkv_w = self._fix_qkv_ordering(qkv_w, head_size, num_heads)
qkv_b = self._fix_qkv_ordering(qkv_b, head_size, num_heads)
new_key = "blocks." + index_block + ".self_attention.query_key_value.weight"
oneflow_state_dict[new_key] = qkv_w
new_key = new_key.replace("weight", "bias")
oneflow_state_dict[new_key] = qkv_b
elif "output" in key:
if "dense" in key:
if "weight" in key:
new_key = "blocks." + index_block + ".self_attention.dense.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
if "bias" in key:
new_key = "blocks." + index_block + ".self_attention.dense.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "intermediate" in key:
index_block = key.split(".")[index_idx]
if "weight" in key:
if (
"blocks." + index_block + ".mlp.dense_h_to_4h.weight"
in oneflow_state_dict.keys()
):
continue
w = key
b = key.replace("weight", "bias")
new_key = "blocks." + index_block + ".mlp.dense_h_to_4h.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(w)
new_key = new_key.replace("weight", "bias")
oneflow_state_dict[new_key] = oneflow_state_dict.pop(b)
elif "output" in key:
index_block = key.split(".")[index_idx]
if "dense.weight" in key:
if (
"blocks." + index_block + ".mlp.dense_4h_to_h.weight"
in oneflow_state_dict.keys()
):
continue
w = key
b = w.replace("weight", "bias")
new_key = "blocks." + index_block + ".mlp.dense_4h_to_h.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(w)
new_key = new_key.replace("weight", "bias")
oneflow_state_dict[new_key] = oneflow_state_dict.pop(b)
elif "layernorm" in key:
if "weight" in key:
new_key = "norm.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "norm.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "classifier" in key:
if "weight" in key:
new_key = "head.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "head.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
else:
oneflow_state_dict[key] = oneflow_state_dict.pop(key)
return oneflow_state_dict
def _load_config_from_json(self, config_file):
"""load config from `config.json`, and update default config.
Args:
config_file (str): Path of config file.
"""
with open(config_file, mode="r", encoding="utf-8") as f:
cfg_dict = json.load(f)
# update libai_cfg by config.json
self._update_cfg("img_size", cfg_dict["image_size"])
self._update_cfg("patch_size", cfg_dict["patch_size"])
self._update_cfg("in_chans", cfg_dict["num_channels"])
self._update_cfg("embed_dim", cfg_dict["hidden_size"])
self._update_cfg("depth", cfg_dict["num_hidden_layers"])
self._update_cfg("num_heads", cfg_dict["num_attention_heads"])
self._update_cfg("attn_drop_rate", cfg_dict["attention_probs_dropout_prob"])
self._update_cfg("drop_rate", cfg_dict["hidden_dropout_prob"])
# update libai_cfg by kwargs
for k, v in self.kwargs.items():
self._update_cfg(k, v)
self._update_cfg_log()
class ViTLoaderLiBai(ModelLoaderLiBai):
def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
self.base_model_prefix_2 = ""
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import oneflow.nn as nn
def init_method_normal(sigma, mean=0.0):
"""Init method based on N(0, sigma)."""
def init_(tensor):
return nn.init.normal_(tensor, mean=mean, std=sigma)
return init_
def scaled_init_method_normal(sigma, num_layers, mean=0.0):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor):
return nn.init.normal_(tensor, mean=mean, std=std)
return init_
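# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): the scaled init
# shrinks the std by sqrt(2 * num_layers), which keeps the variance of deep
# residual branches under control. The sigma and depth below are assumptions.
if __name__ == "__main__":
    import oneflow as flow

    init_fn = scaled_init_method_normal(sigma=0.02, num_layers=12)
    w = flow.empty(4, 4)
    init_fn(w)  # samples from N(0, 0.02 / sqrt(24)) ~= N(0, 0.0041)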
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
import libai.utils.distributed as dist
from libai.config.config import configurable
from libai.layers import LayerNorm, Linear, PatchEmbedding, TransformerLayer
class VisionTransformer(nn.Module):
"""Vision Transformer in LiBai.
LiBai's implementation of:
`An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
<https://arxiv.org/abs/2010.11929>`_
Args:
img_size (int, tuple(int)): input image size
patch_size (int, tuple(int)): patch size
in_chans (int): number of input channels
embed_dim (int): embedding dimension
depth (int): depth of transformer
num_heads (int): number of attention heads
        mlp_ratio (float): ratio of mlp hidden dim to embedding dim
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
num_classes (int): number of classes for classification head
loss_func (callable, optional): loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=192,
depth=12,
num_heads=3,
mlp_ratio=4.0,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.0,
num_classes=1000,
loss_func=None,
):
super().__init__()
self.img_size = img_size
self.num_classes = num_classes
self.patch_embed = PatchEmbedding(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
ffn_size = int(embed_dim * mlp_ratio)
num_patches = self.patch_embed.num_patches
self.cls_token = nn.Parameter(
flow.zeros(
1,
1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.pos_embed = nn.Parameter(
flow.zeros(
1,
num_patches + 1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [
x.item() for x in flow.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
self.blocks = nn.Sequential(
*[
TransformerLayer(
hidden_size=embed_dim,
ffn_hidden_size=ffn_size,
num_attention_heads=num_heads,
attention_dropout_prob=attn_drop_rate,
output_dropout_prob=drop_rate,
drop_path_prob=dpr[i],
layer_idx=i,
)
for i in range(depth)
]
)
self.norm = LayerNorm(embed_dim, layer_idx=-1)
self.head = Linear(embed_dim, num_classes, layer_idx=-1)
# loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
# weight init
trunc_normal_(self.pos_embed, std=0.02)
trunc_normal_(self.cls_token, std=0.02)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def no_weight_decay(self):
return {"pos_embed", "cls_token"}
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"num_heads": cfg.num_heads,
"mlp_ratio": cfg.mlp_ratio,
"drop_rate": cfg.drop_rate,
"attn_drop_rate": cfg.attn_drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"num_classes": cfg.num_classes,
"loss_func": cfg.loss_func,
}
def forward_features(self, x):
# patch embedding
x = self.patch_embed(x)
cls_token = self.cls_token.expand(
x.shape[0], -1, -1
) # stole cls_tokens impl from Phil Wang, thanks
cls_token = cls_token.to_global(sbp=x.sbp, placement=cls_token.placement)
x = flow.cat((cls_token, x), dim=1)
# position embedding
pos_embed = self.pos_embed.expand(x.shape[0], -1, -1)
pos_embed = pos_embed.to_global(sbp=x.sbp, placement=pos_embed.placement)
x = self.pos_drop(x + pos_embed)
# transformer block
x = self.blocks(x)
return x
def forward_head(self, x):
x = self.norm(x)
outcome = x[:, 0]
outcome = self.head(outcome)
return outcome
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.forward_head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.pos_embed, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, PatchEmbedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set pos_embed and cls_token stage id
model.pos_embed.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.cls_token.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), PatchEmbedding):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set pos_embed and cls_token stage id
model.pos_embed.to(flow.nn.graph.GraphTensor).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.cls_token.to(flow.nn.graph.GraphTensor).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.norm.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
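# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): the stochastic
# depth decay rule used in __init__ above. The drop-path probability grows
# linearly from 0 at the first block to drop_path_rate at the last one.
if __name__ == "__main__":
    depth, drop_path_rate = 12, 0.1
    dpr = [x.item() for x in flow.linspace(0, drop_path_rate, depth)]
    print(dpr[0], dpr[-1])  # 0.0 0.1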
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check
from libai.config import LazyConfig
from libai.models.utils import GPT2LoaderLiBai
from projects.MagicPrompt.gpt2 import GPTModel
def get_model(config_file):
cfg = LazyConfig.load(config_file)
cfg.model.cfg.pretrained_model_path = None
cfg.dataloader = None
cfg.tokenization = None
print("Building model....")
    loader = GPT2LoaderLiBai(GPTModel, cfg.model.cfg, "/path/to/model")
model = loader.load()
print("Build model finished.")
return model
class gpt2Graph(nn.Graph):
def __init__(self, eager_model):
super().__init__()
self.model = eager_model
def build(
self,
input_ids,
):
out = self.model(
input_ids,
)
return out
if __name__ == "__main__":
model = get_model("projects/MagicPrompt/configs/gpt2_inference.py")
model.eval()
gpt2_graph = gpt2Graph(model)
# Build the static graph model
input_ids = flow.ones(
1, 5, dtype=flow.int64, sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
)
# check your model.forward is valid
# output = gpt2_graph(
# input_ids
# )
print("Compiling the graph which may make some time, please wait for a moment....")
gpt2_graph._compile(
input_ids,
)
convert_to_onnx_and_check(
gpt2_graph,
external_data=False,
opset=11,
flow_weight_dir=None,
onnx_model_path="./",
dynamic_batch_size=False,
device="gpu_global",
input_tensor_range=[0, 10],
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from typing import List, Optional

import numpy as np
import onnxruntime as ort
from packaging import version
class OnnxModel:
    """Thin wrapper around an onnxruntime InferenceSession for running exported models."""
def __init__(
self,
onnx_filename,
        providers: Optional[List[str]] = None,
ort_optimize: bool = True,
):
ort_sess_opt = ort.SessionOptions()
ort_sess_opt.graph_optimization_level = (
ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
if ort_optimize
else ort.GraphOptimizationLevel.ORT_DISABLE_ALL
)
        if providers is None:
            # Plain string comparison mis-orders versions like "1.10.0" < "1.9.0",
            # so parse the version before comparing.
            if version.parse(ort.__version__) > version.parse("1.9.0"):
providers = [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
]
else:
providers = ["CPUExecutionProvider"]
self.sess = ort.InferenceSession(
onnx_filename, sess_options=ort_sess_opt, providers=providers
)
def forward(self, input_list):
ipt_dict = OrderedDict()
for idx, ipt in enumerate(self.sess.get_inputs()):
ipt_dict[ipt.name] = input_list[idx]
onnx_res = self.sess.run([], ipt_dict)
return onnx_res
if __name__ == "__main__":
onnx_model = OnnxModel("model.onnx")
input_list = [
        np.ones((1, 5)).astype(np.int64),
]
print(onnx_model.forward(input_list))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from typing import List, Optional

import numpy as np
import onnxruntime as ort
from packaging import version
class OnnxModel:
    """Thin wrapper around an onnxruntime InferenceSession for running exported models."""
def __init__(
self,
onnx_filename,
        providers: Optional[List[str]] = None,
ort_optimize: bool = True,
):
ort_sess_opt = ort.SessionOptions()
ort_sess_opt.graph_optimization_level = (
ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
if ort_optimize
else ort.GraphOptimizationLevel.ORT_DISABLE_ALL
)
        if providers is None:
            # Plain string comparison mis-orders versions like "1.10.0" < "1.9.0",
            # so parse the version before comparing.
            if version.parse(ort.__version__) > version.parse("1.9.0"):
providers = [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
]
else:
providers = ["CPUExecutionProvider"]
self.sess = ort.InferenceSession(
onnx_filename, sess_options=ort_sess_opt, providers=providers
)
def forward(self, input_list):
ipt_dict = OrderedDict()
for idx, ipt in enumerate(self.sess.get_inputs()):
ipt_dict[ipt.name] = input_list[idx]
onnx_res = self.sess.run([], ipt_dict)
return onnx_res
if __name__ == "__main__":
onnx_model = OnnxModel("model.onnx")
input_list = [
np.ones((1, 5)).astype(np.int64),
np.ones((1, 3)).astype(np.int64),
np.ones((1, 5, 5)).astype(bool),
np.ones((1, 3, 3)).astype(bool),
np.ones((1, 3, 5)).astype(bool),
]
print(onnx_model.forward(input_list))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check
from libai.config import LazyConfig
from projects.MT5.mt5_model import MT5Model
from projects.MT5.utils.mt5_loader import T5LoaderHuggerFace
def get_model(config_file):
cfg = LazyConfig.load(config_file)
cfg.model.cfg.model_type = "mt5"
cfg.model.cfg.pretrained_model_path = None
cfg.dataloader = None
cfg.tokenization = None
print("Building model....")
loader = T5LoaderHuggerFace(MT5Model, cfg.model.cfg, "/path/to/model")
model = loader.load()
print("Build model finished.")
return model
class t5Graph(nn.Graph):
def __init__(self, eager_model):
super().__init__()
self.model = eager_model
    def build(
        self,
        encoder_input_ids,
        decoder_input_ids,
        encoder_attn_mask,
        decoder_attn_mask,
        encoder_decoder_attn_mask,
    ):
        out = self.model(
            encoder_input_ids,
            decoder_input_ids,
            encoder_attn_mask,
            decoder_attn_mask,
            encoder_decoder_attn_mask,
        )
        return out
if __name__ == "__main__":
model = get_model("projects/MT5/configs/mt5_pretrain.py")
model.eval()
t5_graph = t5Graph(model)
# Build the static graph model
encoder_input_ids = flow.ones(
1, 5, dtype=flow.int64, sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
)
    # ids are 2-D int64 tensors; attention masks are 3-D bool tensors. They are
    # passed positionally to the model's forward, so the names below follow the
    # tensor contents.
    decoder_input_ids = flow.ones(
        1, 3, dtype=flow.int64, sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
    )
    encoder_attn_mask = flow.ones(
        1,
        5,
        5,
        dtype=flow.bool,
        sbp=flow.sbp.broadcast,
        placement=flow.placement("cuda", ranks=[0]),
    )
decoder_attn_mask = flow.ones(
1,
3,
3,
dtype=flow.bool,
sbp=flow.sbp.broadcast,
placement=flow.placement("cuda", ranks=[0]),
)
encoder_decoder_attn_mask = flow.ones(
1,
3,
5,
dtype=flow.bool,
sbp=flow.sbp.broadcast,
placement=flow.placement("cuda", ranks=[0]),
)
    # check your model.forward is valid
    # output = t5_graph(
    #     encoder_input_ids,
    #     decoder_input_ids,
    #     encoder_attn_mask,
    #     decoder_attn_mask,
    #     encoder_decoder_attn_mask,
    # )
    # print(output)
print("Compiling the graph which may make some time, please wait for a moment....")
    t5_graph._compile(
        encoder_input_ids,
        decoder_input_ids,
        encoder_attn_mask,
        decoder_attn_mask,
        encoder_decoder_attn_mask,
    )
convert_to_onnx_and_check(
t5_graph,
external_data=False,
opset=11,
flow_weight_dir=None,
onnx_model_path="./",
dynamic_batch_size=False,
device="gpu_global",
input_tensor_range=[0, 10],
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .build import build_optimizer, get_default_optimizer_params
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import defaultdict
from typing import Any, Dict, List
import oneflow as flow
from libai.config import instantiate
from libai.layers import LayerNorm
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/solver/build.py
# --------------------------------------------------------
def build_optimizer(cfg, model):
"""
Build an optimizer from config.
"""
cfg.params.model = model
optim = instantiate(cfg)
return optim
def get_default_optimizer_params(
model,
base_lr=None,
weight_decay=None,
weight_decay_norm=None,
weight_decay_bias=None,
clip_grad_max_norm=None,
clip_grad_norm_type=None,
overrides=None,
):
"""
    Get default param list for the optimizer, with support for a few types of overrides.
If no overrides are needed, it is equivalent to `model.parameters()`.
Arguments:
base_lr: lr for every group by default. Can be omitted to use the one in optimizer.
weight_decay: weight decay for every group by default. Can be omitted to use the one
in optimizer.
weight_decay_norm: override weight decay for params in normalization layers
weight_decay_bias: override weight decay for bias parameters
overrides: if not `None`, provides values for optimizer hyperparameters
(LR, weight decay) for module parameters with a given name; e.g.
``{"embedding": {"lr": 0.01, "weight_decay": 0.1}}`` will set the LR and
weight decay values for all module parameters named `embedding`.
For common transformer models, ``weight_decay_norm`` and ``weight_decay_bias``
are usually set to 0.
Example:
::
flow.optim.AdamW(
get_default_optimizer_params(model, weight_decay_norm=0, weight_decay_bias=0),
lr=0.01,
weight_decay=1e-4
)
"""
if overrides is None:
overrides = {}
defaults = {}
if base_lr is not None:
defaults["lr"] = base_lr
if weight_decay is not None:
defaults["weight_decay"] = weight_decay
if clip_grad_max_norm is not None and clip_grad_norm_type is not None:
defaults["clip_grad_max_norm"] = clip_grad_max_norm
defaults["clip_grad_norm_type"] = clip_grad_norm_type
bias_overrides = {}
if weight_decay_bias is not None:
bias_overrides["weight_decay"] = weight_decay_bias
if len(bias_overrides):
if "bias" in overrides:
raise ValueError("Conflicting overrides for 'bias'")
overrides["bias"] = bias_overrides
norm_module_types = (
LayerNorm,
flow.nn.BatchNorm1d,
flow.nn.BatchNorm2d,
flow.nn.BatchNorm3d,
flow.nn.GroupNorm,
flow.nn.InstanceNorm1d,
flow.nn.InstanceNorm2d,
flow.nn.InstanceNorm3d,
flow.nn.FusedBatchNorm1d,
flow.nn.FusedBatchNorm2d,
flow.nn.FusedBatchNorm3d,
)
params = []
memo = set()
for module in model.modules():
for model_param_name, value in module.named_parameters(recurse=False):
if not value.requires_grad:
continue
# Avoid duplicating parameters
if value in memo:
continue
memo.add(value)
hyperparams = copy.copy(defaults)
if isinstance(module, norm_module_types) and weight_decay_norm is not None:
hyperparams["weight_decay"] = weight_decay_norm
hyperparams.update(overrides.get(model_param_name, {}))
params.append({"params": [value], **hyperparams})
return reduce_param_groups(params)
def _expand_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Transform parameter groups into per-parameter structure.
Later items in `params` can overwrite parameters set in previous items.
"""
ret = defaultdict(dict)
for item in params:
assert "params" in item
cur_params = {x: y for x, y in item.items() if x != "params"}
for param in item["params"]:
ret[param].update({"params": [param], **cur_params})
return list(ret.values())
def reduce_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Reorganize the parameter groups and merge duplicated groups.
    The number of parameter groups needs to be as small as possible in order
    to use the OneFlow multi-tensor optimizer efficiently. Therefore, instead
    of using one parameter group per single parameter, we reorganize the
    parameter groups and merge duplicated groups, which speeds up the
    multi-tensor optimizer significantly.
"""
params = _expand_param_groups(params)
groups = defaultdict(list) # re-group all parameter groups by their hyperparams
for item in params:
cur_params = tuple((x, y) for x, y in item.items() if x != "params")
groups[cur_params].extend(item["params"])
ret = []
for param_keys, param_values in groups.items():
cur = {kv[0]: kv[1] for kv in param_keys}
cur["params"] = param_values
ret.append(cur)
return ret
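# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): on a toy model,
# the per-parameter groups produced by get_default_optimizer_params collapse
# into a few merged groups. The layer sizes and hyperparameters are assumptions.
if __name__ == "__main__":
    toy = flow.nn.Sequential(flow.nn.Linear(4, 4), flow.nn.BatchNorm1d(4))
    groups = get_default_optimizer_params(
        toy, base_lr=1e-3, weight_decay=0.01, weight_decay_norm=0.0, weight_decay_bias=0.0
    )
    # Linear.weight keeps weight_decay=0.01, while both biases and the
    # BatchNorm weight share weight_decay=0.0, so four parameters merge
    # into two groups.
    print(len(groups))  # 2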
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .build import build_lr_scheduler
from .lr_scheduler import (
WarmupCosineAnnealingLR,
WarmupCosineLR,
WarmupExponentialLR,
WarmupMultiStepLR,
WarmupPolynomialLR,
WarmupStepLR,
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.config import instantiate
def build_lr_scheduler(cfg, optimizer):
"""Build learning rate scheduler, defined by ``cfg``."""
cfg.optimizer = optimizer
scheduler = instantiate(cfg)
return scheduler
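# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): `cfg` is
# expected to be a LazyCall config whose `optimizer` field is filled in here.
# The WarmupCosineLR import path and all values below are assumptions.
if __name__ == "__main__":
    import oneflow as flow
    from libai.config import LazyCall
    from libai.scheduler import WarmupCosineLR  # assumed import path

    optimizer = flow.optim.SGD([flow.nn.Parameter(flow.zeros(1))], lr=0.1)
    cfg = LazyCall(WarmupCosineLR)(max_iter=1000, warmup_factor=0.001, warmup_iter=100)
    scheduler = build_lr_scheduler(cfg, optimizer)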
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import oneflow as flow
logger = logging.getLogger(__name__)
def WarmupCosineLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
alpha: float = 0.0,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following
the values of the Cosine function between the initial lr set in the
optimizer to 0, after a warmup period during which it increases linearly
between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
alpha (float, optional): The learning rate scale factor (:math:`\\alpha`). Defaults to 0.0.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in
the first epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
cosine_decay_lr = flow.optim.lr_scheduler.CosineDecayLR(
optimizer, decay_steps=max_iter, alpha=alpha
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return CosineLR")
return cosine_decay_lr
elif warmup_iter > max_iter:
logger.warning("warmup iters is larger than the total training iters")
warmup_cosine_lr = flow.optim.lr_scheduler.WarmUpLR(
cosine_decay_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_cosine_lr
def WarmupCosineAnnealingLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
eta_min: float = 0.0,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following
the values of the Cosine Annealing function between the initial
lr set in the optimizer to 0, after a warmup period during which
it increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
eta_min (float, optional): Minimum learning rate. Defaults to 0.0.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first epoch
and then inreases linearly to reach 1. Defaults to "linear".
"""
cosine_annealing_lr = flow.optim.lr_scheduler.CosineAnnealingLR(
optimizer, T_max=max_iter, eta_min=eta_min
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return CosineAnnealingLR")
return cosine_annealing_lr
warmup_cosine_annealing_lr = flow.optim.lr_scheduler.WarmUpLR(
cosine_annealing_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_cosine_annealing_lr
def WarmupStepLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
step_size: int,
gamma: float = 0.1,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following the values of the Step
function between the initial lr set in the optimizer to 0, after a warmup period during which
it increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
step_size (int): Period of learning rate decay.
gamma (float, optional): Multiplicative factor of learning rate decay. Defaults to 0.1.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first
epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
step_lr = flow.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return StepLR")
return step_lr
warmup_step_lr = flow.optim.lr_scheduler.WarmUpLR(
step_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_step_lr
def WarmupMultiStepLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
milestones: list,
gamma: float = 0.1,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following the values of the MultiStep
function between the initial lr set in the optimizer to 0, after a warmup period during which
it increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
milestones (list): List of step indices. Must be increasing.
gamma (float, optional): Multiplicative factor of learning rate decay. Defaults to 0.1.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first
epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
multistep_lr = flow.optim.lr_scheduler.MultiStepLR(
optimizer, milestones=milestones, gamma=gamma
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return MultiStepLR")
return multistep_lr
warmup_multistep_lr = flow.optim.lr_scheduler.WarmUpLR(
multistep_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_multistep_lr
def WarmupExponentialLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
gamma: float,
warmup_factor: float,
warmup_iter: int,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following the values of
the Exponential function between the initial lr set in the optimizer to 0,
after a warmup period during which it increases linearly between 0 and the
initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
gamma (float): Multiplicative factor of learning rate decay.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first epoch
and then inreases linearly to reach 1. Defaults to "linear".
"""
exponential_lr = flow.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return ExponentialLR")
return exponential_lr
warmup_exponential_lr = flow.optim.lr_scheduler.WarmUpLR(
exponential_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_exponential_lr
def WarmupPolynomialLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
end_learning_rate: float = 0.0001,
power: float = 1.0,
cycle: bool = False,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases as a polynomial decay from
the initial lr set in the optimizer to end lr defined by `lr_end`,
after a warmup period during which it increases linearly from 0 to the
initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
end_learning_rate (float, optional): The final learning rate. Defaults to 0.0001.
power (float, optional): The power of polynomial. Defaults to 1.0.
cycle (bool, optional): If cycle is True, the scheduler will decay the learning rate
every decay steps. Defaults to False.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first
epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
polynomial_lr = flow.optim.lr_scheduler.PolynomialLR(
optimizer,
decay_batch=max_iter,
end_learning_rate=end_learning_rate,
power=power,
cycle=cycle,
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return PolynomialLR")
return polynomial_lr
warmup_polynomial_lr = flow.optim.lr_scheduler.WarmUpLR(
polynomial_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_polynomial_lr
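# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): stepping a
# warmup cosine schedule on a toy optimizer. All hyperparameters below are
# assumptions.
if __name__ == "__main__":
    param = flow.nn.Parameter(flow.zeros(1))
    optimizer = flow.optim.SGD([param], lr=0.1)
    scheduler = WarmupCosineLR(
        optimizer, max_iter=1000, warmup_factor=0.001, warmup_iter=100
    )
    for _ in range(3):
        optimizer.step()
        scheduler.step()
    # During warmup the lr climbs from lr * warmup_factor toward the base lr.
    print(optimizer.param_groups[0]["lr"])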
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .build import build_tokenizer
from .tokenization_bert import BertTokenizer
from .tokenization_roberta import RobertaTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_t5 import T5Tokenizer
from .tokenization_base import PreTrainedTokenizer