ModelZoo / BLOOM_oneflow · Commits · 9fdb7dab

Commit 9fdb7dab, authored Mar 30, 2023 by yuguo960516
Commit message: bloom
Pipeline #150 failed with stages in 0 seconds
Changes: 332 · Pipelines: 1

Showing 20 changed files with 1561 additions and 0 deletions (+1561, -0)
projects/MOCOV3/configs/models/moco_vit_small_patch16.py   +40   -0
projects/MOCOV3/configs/models/vit_base_patch16.py         +19   -0
projects/MOCOV3/configs/models/vit_small_patch16.py        +7    -0
projects/MOCOV3/modeling/moco.py                           +154  -0
projects/MOCOV3/modeling/vit.py                            +157  -0
projects/MOCOV3/pretrain_net.py                            +79   -0
projects/MOCOV3/trainer/moco_trainer.py                    +43   -0
projects/MOCOV3/transform/linear_prob_transform.py         +27   -0
projects/MOCOV3/transform/pretrain_transform.py            +95   -0
projects/MOCOV3/utils/load_checkpoint.py                   +72   -0
projects/MOCOV3/utils/weight_convert.py                    +107  -0
projects/MT5/configs/mt5_base.py                           +30   -0
projects/MT5/configs/mt5_large.py                          +30   -0
projects/MT5/configs/mt5_pretrain.py                       +71   -0
projects/MT5/configs/mt5_small.py                          +30   -0
projects/MT5/configs/t5_inference.py                       +46   -0
projects/MT5/layers/attention_layer.py                     +345  -0
projects/MT5/layers/embed_layer.py                         +121  -0
projects/MT5/layers/lm_head_layer.py                       +36   -0
projects/MT5/layers/logits_layer.py                        +52   -0
projects/MOCOV3/configs/models/moco_vit_small_patch16.py (new file, mode 100644)

from libai.config import LazyCall
from modeling.moco import MoCo_ViT
from modeling.vit import VisionTransformer

base_encoder = LazyCall(VisionTransformer)(
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=384,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    drop_path_rate=0.0,
    global_pool=False,
    stop_grad_conv1=True,
)

momentum_encoder = LazyCall(VisionTransformer)(
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=384,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    drop_path_rate=0.0,
    global_pool=False,
    stop_grad_conv1=True,
)

model = LazyCall(MoCo_ViT)(
    base_encoder=base_encoder,
    momentum_encoder=momentum_encoder,
    dim=256,
    mlp_dim=4096,
    T=0.2,
    m=0.99,
)
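A minimal sketch (not part of the commit) of how such a LazyCall config is typically materialized. It assumes LiBai exposes `LazyConfig.load` and `instantiate` in libai.config, the same lazy-config mechanism the pretrain entry point below relies on; check libai.config for the exact entry point.

import sys

sys.path.append("projects/MOCOV3")  # so `modeling.moco` / `modeling.vit` resolve (assumption)

from libai.config import LazyConfig, instantiate  # `instantiate` is assumed to exist here

cfg = LazyConfig.load("projects/MOCOV3/configs/models/moco_vit_small_patch16.py")
moco = instantiate(cfg.model)  # recursively builds MoCo_ViT together with both ViT encoders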
projects/MOCOV3/configs/models/vit_base_patch16.py (new file, mode 100644)

import sys

sys.path.append("projects/MOCOV3")

from libai.config import LazyCall  # noqa: E402
from modeling.vit import VisionTransformer  # noqa: E402

model = LazyCall(VisionTransformer)(
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    drop_path_rate=0.1,
    global_pool=False,
)
projects/MOCOV3/configs/models/vit_small_patch16.py (new file, mode 100644)

from .vit_base_patch16 import model

model.embed_dim = 384
model.depth = 12
model.num_heads = 12
model.drop_path_rate = 0.0
projects/MOCOV3/modeling/moco.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# MoCo v3 Model
# References:
# moco-v3: https://github.com/facebookresearch/moco-v3/blob/main/moco/builder.py
# --------------------------------------------------------

import math

import oneflow as flow
import oneflow.nn as nn

from libai.layers import Linear
from libai.utils.distributed import get_world_size


class MoCo(nn.Module):
    """
    Build a MoCo model with a base encoder, a momentum encoder, and two MLPs
    https://arxiv.org/abs/1911.05722
    """

    def __init__(
        self, base_encoder, momentum_encoder, dim=256, mlp_dim=4096, T=1.0, m=0.99, max_iter=300
    ):
        """
        dim: feature dimension (default: 256)
        mlp_dim: hidden dimension in MLPs (default: 4096)
        T: softmax temperature (default: 1.0)
        """
        super(MoCo, self).__init__()

        self.T = T
        self.m = m

        # build encoders
        self.base_encoder = base_encoder
        self.momentum_encoder = momentum_encoder
        self.base_encoder.num_classes = dim
        self.momentum_encoder.num_classes = dim
        self.max_iter = max_iter

        self._build_projector_and_predictor_mlps(dim, mlp_dim)

        for param_b, param_m in zip(
            self.base_encoder.parameters(), self.momentum_encoder.parameters()
        ):
            param_m.data.copy_(param_b.data)  # initialize
            param_m.requires_grad = False  # not update by gradient

    def _build_mlp(self, num_layers, input_dim, mlp_dim, output_dim, last_bn=True):
        mlp = []
        for l in range(num_layers):
            dim1 = input_dim if l == 0 else mlp_dim
            dim2 = output_dim if l == num_layers - 1 else mlp_dim

            mlp.append(Linear(dim1, dim2, bias=False))  # libai

            if l < num_layers - 1:
                mlp.append(nn.BatchNorm1d(dim2))
                mlp.append(nn.ReLU(inplace=True))
            elif last_bn:
                # follow SimCLR's design:
                # https://github.com/google-research/simclr/blob/master/model_util.py#L157
                # for simplicity, we further removed gamma in BN
                # TODO: affine should be False (bug here)
                mlp.append(nn.BatchNorm1d(dim2, affine=True))

        return nn.Sequential(*mlp)

    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
        pass

    @flow.no_grad()
    def _update_momentum_encoder(self, m):
        """Momentum update of the momentum encoder"""
        for param_b, param_m in zip(
            self.base_encoder.parameters(), self.momentum_encoder.parameters()
        ):
            param_m.data = param_m.data * m + param_b.data * (1.0 - m)

    def contrastive_loss(self, q, k):
        # normalize
        q = nn.functional.normalize(q, dim=1)
        k = nn.functional.normalize(k, dim=1)
        # gather all targets
        # k = concat_all_gather(k).to_global(sbp=q.sbp, placement=q.placement)
        k = k.to_global(sbp=flow.sbp.broadcast)
        # Einstein sum is more intuitive
        logits = flow.einsum("nc,mc->nm", q, k) / self.T
        N = logits.shape[0] // get_world_size()
        labels = (flow.arange(N, dtype=flow.long) + N * flow.env.get_rank()).to_global(
            sbp=flow.sbp.split(0), placement=logits.placement
        )
        return nn.CrossEntropyLoss()(logits, labels) * (2 * self.T)

    def adjust_moco_momentum(self, cu_iter, m):
        """Adjust moco momentum based on current epoch"""
        m = 1.0 - 0.5 * (1.0 + math.cos(math.pi * cu_iter / self.max_iter)) * (1.0 - m)
        return m

    def forward(self, images, labels=None, cu_iter=0, m=0.99):
        if self.training:
            [x1, x2] = flow.chunk(images, 2, dim=1)
            # compute features
            q1 = self.predictor(self.base_encoder(x1)["prediction_scores"])
            q2 = self.predictor(self.base_encoder(x2)["prediction_scores"])

            m = self.adjust_moco_momentum(cu_iter, m)  # update the moco_momentum
            with flow.no_grad():  # no gradient
                self._update_momentum_encoder(m)  # update the momentum encoder
                # compute momentum features as targets
                k1 = self.momentum_encoder(x1)["prediction_scores"]
                k2 = self.momentum_encoder(x2)["prediction_scores"]

            return (
                {"losses": self.contrastive_loss(q1, k2) + self.contrastive_loss(q2, k1)},
                {"m": m},
            )
        else:
            return self.base_encoder(images)


class MoCo_ViT(MoCo):
    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
        hidden_dim = self.base_encoder.head.weight.shape[1]
        # projectors
        self.base_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
        self.momentum_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
        # predictor
        self.predictor = self._build_mlp(2, dim, mlp_dim, dim)
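The momentum coefficient in adjust_moco_momentum follows a half-cosine schedule from its initial value m up to 1.0 over max_iter steps. A standalone restatement of that formula (plain Python, no OneFlow needed; a sketch for illustration, not part of the commit):

import math

def moco_momentum(cu_iter, m=0.99, max_iter=300):
    # identical formula to MoCo.adjust_moco_momentum above
    return 1.0 - 0.5 * (1.0 + math.cos(math.pi * cu_iter / max_iter)) * (1.0 - m)

print(moco_momentum(0))               # 0.99   (start of training)
print(moco_momentum(150))             # 0.995  (halfway through the schedule)
print(round(moco_momentum(300), 6))   # 1.0    (end: momentum encoder effectively frozen)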
projects/MOCOV3/modeling/vit.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ViT Model
# References:
# moco-v3: https://github.com/facebookresearch/moco-v3/blob/main/vits.py
# --------------------------------------------------------

import math
from functools import reduce
from operator import mul

import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
from utils.load_checkpoint import load_checkpoint

from libai.layers import Linear, PatchEmbedding
from libai.models import vision_transformer


class VisionTransformer(vision_transformer.VisionTransformer):
    """Vision Transformer for MOCO
    LiBai impl of: `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
    - https://arxiv.org/abs/2010.11929
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        global_pool=False,
        num_classes=1000,
        loss_func=None,
        linear_prob=None,
        weight_style="pytorch",
        stop_grad_conv1=False,
    ):
        super(VisionTransformer, self).__init__(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            num_classes=num_classes,
            loss_func=loss_func,
        )
        self.global_pool = global_pool
        # weight init
        if linear_prob:
            load_checkpoint(self, linear_prob, weight_style, num_heads, embed_dim)
            self.head.weight.data.normal_(mean=0.0, std=0.01)
            self.head.bias.data.zeros_()
        else:
            trunc_normal_(self.pos_embed, std=0.02)
            trunc_normal_(self.cls_token, std=0.02)
            self.apply(self._init_weights)
            self.stop_grad_conv1 = stop_grad_conv1
            self.embed_dim = embed_dim
            self.initialization()

    def initialization(self):
        # Use fixed 2D sin-cos position embedding
        self.build_2d_sincos_position_embedding()

        # weight initialization
        for name, m in self.named_modules():
            if isinstance(m, Linear):
                if "query_key_value" in name:
                    val = math.sqrt(6.0 / float(m.weight.shape[0] // 3 + m.weight.shape[1]))
                    nn.init.uniform_(m.weight, -val, val)
                else:
                    nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)
        nn.init.normal_(self.cls_token, std=1e-6)

        if isinstance(self.patch_embed, PatchEmbedding):
            # xavier_uniform initialization
            val = math.sqrt(
                6.0 / float(3 * reduce(mul, self.patch_embed.patch_size, 1) + self.embed_dim)
            )
            nn.init.uniform_(self.patch_embed.proj.weight, -val, val)
            nn.init.zeros_(self.patch_embed.proj.bias)

            if self.stop_grad_conv1:
                self.patch_embed.proj.weight.requires_grad = False
                self.patch_embed.proj.bias.requires_grad = False

    def build_2d_sincos_position_embedding(self, temperature=10000.0):
        sbp = self.pos_embed.sbp
        placement = self.pos_embed.placement
        h, w = self.patch_embed.grid_size
        grid_w = flow.arange(w, dtype=flow.float32).to_global(sbp=sbp, placement=placement)
        grid_h = flow.arange(h, dtype=flow.float32).to_global(sbp=sbp, placement=placement)
        grid_w, grid_h = flow.meshgrid(grid_w, grid_h)
        assert (
            self.embed_dim % 4 == 0
        ), "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
        pos_dim = self.embed_dim // 4
        omega = (flow.arange(pos_dim, dtype=flow.float32) / pos_dim).to_global(
            sbp=sbp, placement=placement
        )
        omega = 1.0 / flow.tensor(temperature).to_global(sbp=sbp, placement=placement) ** omega
        out_w = flow.einsum("m,d->md", grid_w.flatten(), omega)
        out_h = flow.einsum("m,d->md", grid_h.flatten(), omega)
        pos_emb = flow.cat(
            [flow.sin(out_w), flow.cos(out_w), flow.sin(out_h), flow.cos(out_h)], dim=1
        )[None, :, :]
        pe_token = flow.zeros([1, 1, self.embed_dim], dtype=flow.float32).to_global(
            sbp=sbp, placement=placement
        )
        self.pos_embed = nn.Parameter(flow.cat([pe_token, pos_emb], dim=1))
        self.pos_embed.requires_grad = False

    def forward_head(self, x):
        if self.global_pool:
            x = x[:, 1:, :].mean(dim=1)  # global pool without cls token
            outcome = self.norm(x)
            outcome = self.head(outcome)
        else:
            x = self.norm(x)
            outcome = x[:, 0]
            outcome = self.head(outcome)
        return outcome
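For reference, the fixed 2D sin-cos position embedding built above can be re-derived with plain NumPy. This is a sketch matching build_2d_sincos_position_embedding (the extra cls-token slot is prepended separately in the model); it is illustrative only and not part of the commit:

import numpy as np

def sincos_pos_embed_2d(h, w, embed_dim, temperature=10000.0):
    # NumPy restatement of build_2d_sincos_position_embedding above
    assert embed_dim % 4 == 0
    grid_w, grid_h = np.meshgrid(
        np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32), indexing="ij"
    )
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (np.arange(pos_dim, dtype=np.float32) / pos_dim)
    out_w = np.einsum("m,d->md", grid_w.flatten(), omega)
    out_h = np.einsum("m,d->md", grid_h.flatten(), omega)
    pos = np.concatenate([np.sin(out_w), np.cos(out_w), np.sin(out_h), np.cos(out_h)], axis=1)
    return pos[None, :, :]  # [1, h*w, embed_dim]

print(sincos_pos_embed_2d(14, 14, 384).shape)  # (1, 196, 384) for a 224/16 patch grid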
projects/MOCOV3/pretrain_net.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import sys

from trainer.moco_trainer import MoCoEagerTrainer

from libai.config import LazyConfig, default_argument_parser, try_get_key
from libai.engine import DefaultTrainer, default_setup
from libai.utils.checkpoint import Checkpointer

sys.path.append(".")

logger = logging.getLogger(__name__)


class MoCoPretrainingTrainer(DefaultTrainer):
    def __init__(self, cfg):
        super().__init__(cfg)
        self.model.max_iter = cfg.train.train_iter
        self._trainer = MoCoEagerTrainer(
            self.model, self.train_loader, self.optimizer, cfg.train.num_accumulation_steps
        )


def main(args):
    cfg = LazyConfig.load(args.config_file)
    cfg = LazyConfig.apply_overrides(cfg, args.opts)
    if try_get_key(cfg, "graph.enabled") is True:
        raise NotImplementedError(
            "LiBai MOCO only support eager global mode now, please set cfg.graph.enabled=False"
        )
    default_setup(cfg, args)

    if args.fast_dev_run:
        cfg.train.train_epoch = 0
        cfg.train.train_iter = 20
        cfg.train.eval_period = 10
        cfg.train.log_period = 1

    if args.eval_only:
        tokenizer = None
        if try_get_key(cfg, "tokenization.setup", default=False):
            tokenizer = MoCoPretrainingTrainer.build_tokenizer(cfg)
        model = MoCoPretrainingTrainer.build_model(cfg)
        Checkpointer(model, save_dir=cfg.train.output_dir).resume_or_load(
            cfg.train.load_weight, resume=args.resume
        )
        if try_get_key(cfg, "train.graph.enabled", default=False):
            model = MoCoPretrainingTrainer.build_graph(cfg, model, is_train=False)
        test_loader = MoCoPretrainingTrainer.build_test_loader(cfg, tokenizer)
        _ = MoCoPretrainingTrainer.test(cfg, test_loader, model)
        return

    trainer = MoCoPretrainingTrainer(cfg)
    return trainer.train()


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    main(args)
projects/MOCOV3/trainer/moco_trainer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
from typing import Callable

from libai.engine.trainer import EagerTrainer


class MoCoEagerTrainer(EagerTrainer):
    def run_step(self, get_batch: Callable):
        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
        start = time.perf_counter()
        # If you want to do something with the data, you can wrap the dataloader.
        data = next(self._data_loader_iter)
        data = get_batch(data, getattr(self.data_loader, "mixup_func", None))
        data_time = time.perf_counter() - start

        # update the moco_momentum per step
        loss_dict, m_dict = self.model(**data, cu_iter=self.iter, m=self.model.m)
        self.model.m = m_dict["m"]
        losses = sum(loss_dict.values()) / self.grad_acc_steps
        losses.backward()
        self.write_metrics(loss_dict, data_time)

        if (self.iter + 1) % self.grad_acc_steps == 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
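run_step divides each micro-batch loss by grad_acc_steps and only steps the optimizer every grad_acc_steps iterations, so gradients accumulate to the same signal as one large batch. A pure-Python illustration of that arithmetic (a sketch, not part of the commit; the numbers are made up):

K = 8  # grad_acc_steps
micro_batch_losses = [2.0, 4.0, 1.0, 3.0, 2.5, 1.5, 3.5, 2.5]  # hypothetical per-step losses

# dividing each loss by K and summing the K backward passes equals the full-batch average
accumulated = sum(loss / K for loss in micro_batch_losses)
full_batch_average = sum(micro_batch_losses) / K
assert abs(accumulated - full_batch_average) < 1e-9
print(accumulated)  # 2.5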
projects/MOCOV3/transform/linear_prob_transform.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from flowvision import transforms
from flowvision.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

from libai.config import LazyCall

train_aug = [
    LazyCall(transforms.RandomResizedCrop)(size=224),
    LazyCall(transforms.RandomHorizontalFlip)(),
    LazyCall(transforms.ToTensor)(),
    LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]
projects/MOCOV3/transform/pretrain_transform.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

import oneflow as flow
from flowvision import transforms
from flowvision.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from PIL import ImageFilter, ImageOps

from libai.config import LazyCall


class GaussianBlur(object):
    """Gaussian blur augmentation from SimCLR: https://arxiv.org/abs/2002.05709"""

    def __init__(self, sigma=[0.1, 2.0]):
        self.sigma = sigma

    def __call__(self, x):
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x


class Solarize(object):
    """Solarize augmentation from BYOL: https://arxiv.org/abs/2006.07733"""

    def __call__(self, x):
        return ImageOps.solarize(x)


# follow BYOL's augmentation recipe: https://arxiv.org/abs/2006.07733
augmentation1 = [
    LazyCall(transforms.RandomResizedCrop)(size=224, scale=(0.2, 1.0)),
    LazyCall(transforms.RandomApply)(
        transforms=[
            LazyCall(transforms.ColorJitter)(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)
            # not strengthened
        ],
        p=0.8,
    ),
    # TODO: Add RandomGrayscale
    # LazyCall(transforms.RandomGrayscale)(p=0.2),
    LazyCall(transforms.RandomApply)(transforms=[LazyCall(GaussianBlur)(sigma=[0.1, 2.0])], p=1.0),
    LazyCall(transforms.RandomHorizontalFlip)(),
    LazyCall(transforms.ToTensor)(),
    LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]

augmentation2 = [
    LazyCall(transforms.RandomResizedCrop)(size=224, scale=(0.2, 1.0)),
    LazyCall(transforms.RandomApply)(
        transforms=[
            LazyCall(transforms.ColorJitter)(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)
            # not strengthened
        ],
        p=0.8,
    ),
    # TODO: Add RandomGrayscale
    # LazyCall(transforms.RandomGrayscale)(p=0.2),
    LazyCall(transforms.RandomApply)(transforms=[LazyCall(GaussianBlur)(sigma=[0.1, 2.0])], p=1.0),
    LazyCall(transforms.RandomApply)(transforms=[LazyCall(Solarize)()], p=0.2),
    LazyCall(transforms.RandomHorizontalFlip)(),
    LazyCall(transforms.ToTensor)(),
    LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]


class TwoCropsTransform:
    """Take two random crops of one image"""

    def __init__(self, base_transform1, base_transform2):
        self.base_transform1 = base_transform1
        self.base_transform2 = base_transform2

    def __call__(self, x):
        im1 = self.base_transform1(x)
        im2 = self.base_transform2(x)
        return flow.cat((im1, im2), dim=0)
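A hedged usage sketch (not part of the commit): once the two augmentation lists are materialized into flowvision Compose pipelines, TwoCropsTransform concatenates the two views along the channel dimension, which MoCo.forward later splits back apart with flow.chunk(images, 2, dim=1). The simplified pipelines and the import path below are assumptions for illustration only.

from flowvision import transforms
from PIL import Image

from transform.pretrain_transform import TwoCropsTransform  # assumes "projects/MOCOV3" is on sys.path

# simplified stand-ins for the full augmentation1/augmentation2 recipes
aug1 = transforms.Compose([transforms.RandomResizedCrop(224), transforms.ToTensor()])
aug2 = transforms.Compose([transforms.RandomResizedCrop(224), transforms.ToTensor()])
two_crops = TwoCropsTransform(aug1, aug2)

img = Image.new("RGB", (256, 256))  # dummy image for illustration
views = two_crops(img)
print(views.shape)  # [6, 224, 224]: two 3-channel crops stacked on dim 0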
projects/MOCOV3/utils/load_checkpoint.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

from utils.weight_convert import load_torch_checkpoint_linear_prob

from libai.utils.checkpoint import (
    Checkpointer,
    get_missing_parameters_message,
    get_unexpected_parameters_message,
)

logger = logging.getLogger("libai." + __name__)


def load_checkpoint(model, path, weight_style, num_heads, embed_dim):
    linear_keyword = "head"
    for name, param in model.named_parameters():
        if name not in ["%s.weight" % linear_keyword, "%s.bias" % linear_keyword]:
            param.requires_grad = False

    assert weight_style in ["pytorch", "oneflow"]
    if weight_style == "pytorch":
        params = load_torch_checkpoint_linear_prob(num_heads, embed_dim, path=path)
    else:
        params = Checkpointer(model).load(path)

    model_state_dict = model.state_dict()

    # check the incorrect shape and unexpected keys
    incorrect_shapes = []
    unexpected_keys = []
    for k in list(params.keys()):
        if k in model_state_dict:
            shape_model = tuple(model_state_dict[k].shape)
            shape_ckp = tuple(params[k].shape)
            if shape_model != shape_ckp:
                incorrect_shapes.append((k, shape_ckp, shape_model))
                params.pop(k)
            model_state_dict.pop(k)
        else:
            unexpected_keys.append(k)
    missing_keys = list(model_state_dict.keys())

    for k, shape_checkpoint, shape_model in incorrect_shapes:
        logger.warning(
            "Skip loading parameter '{}' to the model due to incompatible "
            "shapes: {} in the checkpoint but {} in the "
            "model! You might want to double check if this is expected.".format(
                k, shape_checkpoint, shape_model
            )
        )

    if missing_keys:
        logger.info(get_missing_parameters_message(missing_keys))
    if unexpected_keys:
        logger.info(get_unexpected_parameters_message(unexpected_keys))

    model.load_state_dict(params, strict=False)
projects/MOCOV3/utils/weight_convert.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import oneflow as flow
import torch

logger = logging.getLogger(__name__)


def convert_qkv_weight(value, num_heads, hidden_size):
    """
    Convert qkv.weight to be compatible with LiBai transformer layer.

    Args:
        value: qkv.weight in the loaded checkpoint
        num_heads: number of attention heads
        hidden_size: hidden size of the model
    """
    head_size = int(hidden_size / num_heads)
    qkv_weight = (
        value.view(3, num_heads, head_size, hidden_size)
        .permute(1, 0, 2, 3)
        .contiguous()
        .view(hidden_size * 3, hidden_size)
    )
    return qkv_weight


def convert_qkv_bias(value, num_heads, hidden_size):
    """
    Convert qkv.bias to be compatible with LiBai transformer layer.

    Args:
        value: qkv.bias in the loaded checkpoint
        num_heads: number of attention heads
        hidden_size: hidden size of the model
    """
    head_size = int(hidden_size / num_heads)
    qkv_bias = (
        value.view(3, num_heads, head_size).permute(1, 0, 2).contiguous().view(hidden_size * 3)
    )
    return qkv_bias


def filter_keys(key, value, num_heads, hidden_size):
    """Filtering the state_dict keys and values to match LiBai's MOCOV3 model"""
    if "norm1" in key:
        key = key.replace("norm1", "input_layernorm")
    elif "attn.qkv" in key:
        key = key.replace("attn.qkv", "self_attention.query_key_value")
        if "weight" in key:
            value = convert_qkv_weight(value, num_heads, hidden_size)
        if "bias" in key:
            value = convert_qkv_bias(value, num_heads, hidden_size)
    elif "attn.proj" in key:
        key = key.replace("attn.proj", "self_attention.dense")
    elif "norm2" in key:
        key = key.replace("norm2", "post_attention_layernorm")
    elif "mlp.fc1" in key:
        key = key.replace("mlp.fc1", "mlp.dense_h_to_4h")
    elif "mlp.fc2" in key:
        key = key.replace("mlp.fc2", "mlp.dense_4h_to_h")
    elif "fc_norm" in key:
        key = key.replace("fc_norm", "norm")
    return key, value


def load_torch_checkpoint_linear_prob(
    num_heads,
    hidden_size,
    path="projects/MOCOV3/output/vit-b-300ep.pth.tar",
    linear_keyword="head",
):
    """Load checkpoint from the given torch weights.

    Torch weight from: xxx
    """
    torch_dict = torch.load(path, map_location="cpu")["state_dict"]
    parameters = torch_dict
    new_parameters = dict()
    for key, value in parameters.items():
        if "num_batches_tracked" not in key:
            if key.startswith("module.base_encoder") and not key.startswith(
                "module.base_encoder.%s" % linear_keyword
            ):
                # to global tensor
                key, val = filter_keys(key, value, num_heads, hidden_size)
                val = val.detach().cpu().numpy()
                val = flow.tensor(val).to_global(
                    sbp=flow.sbp.broadcast, placement=flow.placement("cuda", {0: range(1)})
                )
                new_parameters[key[len("module.base_encoder.") :]] = val
    return new_parameters
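convert_qkv_weight regroups the fused qkv weight from a (q, k, v)-block layout into LiBai's head-major layout; the shape stays [3 * hidden, hidden] while rows are reordered per head. A small shape check (a sketch, not part of the commit; the import path assumes "projects/MOCOV3" is on sys.path):

import torch

from utils.weight_convert import convert_qkv_weight  # assumed import path for illustration

num_heads, head_size = 2, 3
hidden = num_heads * head_size
qkv = torch.arange(3 * hidden * hidden, dtype=torch.float32).view(3 * hidden, hidden)

converted = convert_qkv_weight(qkv, num_heads, hidden)
print(converted.shape)  # torch.Size([18, 6]) -- same shape, rows regrouped per head
# row blocks are now ordered (q_head0, k_head0, v_head0, q_head1, k_head1, v_head1),
# each block holding head_size rows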
projects/MT5/configs/mt5_base.py (new file, mode 100644)

from omegaconf import DictConfig

from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining

cfg = dict(
    vocab_size=250112,
    hidden_size=768,
    hidden_layers=12,
    num_attention_heads=12,
    head_size=64,
    intermediate_size=2048,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    embedding_dropout_prob=0.1,
    relative_attention_num_buckets=32,
    initializer_range=1.0,
    layernorm_eps=1e-06,
    amp_enabled=False,
    model_type="mt5",
    eos_token_id=1,
    padding_idx=0,
    is_encoder_decoder=True,
    tie_word_embeddings=True,
)

cfg = DictConfig(cfg)

mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
projects/MT5/configs/mt5_large.py (new file, mode 100644)

from omegaconf import DictConfig

from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining

cfg = dict(
    vocab_size=250112,
    hidden_size=1024,
    hidden_layers=24,
    num_attention_heads=16,
    head_size=64,
    intermediate_size=2816,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    embedding_dropout_prob=0.1,
    relative_attention_num_buckets=32,
    initializer_range=1.0,
    layernorm_eps=1e-06,
    amp_enabled=False,
    model_type="mt5",
    eos_token_id=1,
    padding_idx=0,
    is_encoder_decoder=True,
    tie_word_embeddings=False,
)

cfg = DictConfig(cfg)

mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
projects/MT5/configs/mt5_pretrain.py (new file, mode 100644)

from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from libai.scheduler import WarmupExponentialLR

from configs.common.train import train
from configs.common.data.t5_dataset import dataloader, tokenization
from configs.common.models.graph import graph
from configs.common.optim import optim

from projects.MT5.configs.mt5_base import pretrain_model as model

vocab_file = "./data_test/bert_data/bert-base-chinese-vocab.txt"
data_prefix = "./data_test/bert_data/loss_compara_content_sentence"

tokenization.tokenizer.vocab_file = vocab_file
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix

# model config
model.cfg.hidden_size = 768
model.cfg.hidden_layers = 12
model.cfg.num_attention_heads = 12
model.cfg.head_size = 64
model.cfg.intermediate_size = 2048
model.cfg.model_type = "mt5"
model.cfg.hidden_dropout_prob = 0.0
model.cfg.attention_probs_dropout_prob = 0.0
model.cfg.embedding_dropout_prob = 0.0
model.cfg.vocab_size = 30522
model.cfg.padding_idx = 0
model.cfg.tie_word_embeddings = False
model.cfg.is_encoder_decoder = False
model.cfg.amp_enabled = True
model.cfg.initializer_range = 0.02
model.cfg.pretrained_model_path = None

train.update(
    dict(
        output_dir="projects/MT5/output/mt5_output",
        train_micro_batch_size=4,
        train_epoch=1,
        train_iter=24000,
        log_period=10,
        amp=dict(enabled=True),
        warmup_ratio=1 / 24,
        # checkpointer=dict(period=10, max_to_keep=20),
        input_placement_device="cpu",
        dist=dict(
            data_parallel_size=2,
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
            pipeline_num_layers=2 * model.cfg.hidden_layers,
        ),
        scheduler=LazyCall(WarmupExponentialLR)(
            warmup_factor=0.001,
            gamma=1.0,
            warmup_method="linear",
            warmup_iter=0.0,
        ),
        evaluation=dict(
            evaluator=LazyCall(PPLEvaluator)(),
            enabled=True,
            eval_iter=1e5,
            eval_period=5000,
        ),
    )
)

train.zero_optimization.enabled = True
train.zero_optimization.stage = 2
train.activation_checkpoint.enabled = False
train.num_accumulation_steps = 8
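A quick sanity-check sketch for the dist block above (not part of the commit; it assumes, in the usual Megatron/LiBai convention, that the launched world size equals the product of the three parallel sizes, and that pipeline_num_layers counts encoder plus decoder layers for the seq2seq model):

# hypothetical standalone arithmetic, mirroring the values in the config above
data_parallel, tensor_parallel, pipeline_parallel = 2, 2, 1
hidden_layers = 12

world_size = data_parallel * tensor_parallel * pipeline_parallel
pipeline_num_layers = 2 * hidden_layers  # encoder stack + decoder stack
print(world_size, pipeline_num_layers)   # 4 GPUs expected, 24 pipeline-partitioned layers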
projects/MT5/configs/mt5_small.py (new file, mode 100644)

from omegaconf import DictConfig

from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining

cfg = dict(
    vocab_size=250112,
    hidden_size=512,
    hidden_layers=8,
    num_attention_heads=6,
    head_size=64,
    intermediate_size=1024,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    embedding_dropout_prob=0.1,
    relative_attention_num_buckets=32,
    initializer_range=1.0,
    layernorm_eps=1e-06,
    amp_enabled=False,
    model_type="mt5",
    eos_token_id=1,
    padding_idx=0,
    is_encoder_decoder=True,
    tie_word_embeddings=False,
)

cfg = DictConfig(cfg)

mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
projects/MT5/configs/t5_inference.py (new file, mode 100644)

from .mt5_base import cfg
from libai.config import LazyCall
from libai.tokenizer import T5Tokenizer
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
from configs.common.train import train
from configs.common.data.t5_dataset import tokenization

cfg.update(
    model_type="t5",
    is_encoder_decoder=True,
    max_length=20,
    min_length=0,
    do_sample=False,
    early_stopping=False,
    num_beams=1,
    num_beam_groups=1,
    diversity_penalty=0.0,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    typical_p=1.0,
    repetition_penalty=1.0,
    length_penalty=1.0,
    no_repeat_ngram_size=0,
    encoder_no_repeat_ngram_size=0,
    num_return_sequences=1,
    chunk_size_feed_forward=0,
    output_scores=False,
    forced_bos_token_id=None,
    forced_eos_token_id=None,
    remove_invalid_values=False,
    exponential_decay_length_penalty=None,
    use_cache=True,
    # Tokenizer
    pad_token_id=0,
    eos_token_id=1,
    bos_token_id=None,
    sep_token_id=None,
    decoder_start_token_id=0,
)

model = LazyCall(MT5Model)(cfg=cfg)

tokenization.tokenizer = LazyCall(T5Tokenizer)(
    vocab_file="/path/to/spiece.model",
    add_bos_token=True,
)
projects/MT5/layers/attention_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Tuple

import oneflow as flow
from oneflow import nn

from libai.layers.linear import Linear
from libai.utils import distributed as dist
from projects.MT5.layers.embed_layer import Embedding


class MultiheadAttention(nn.Module):
    """Multi-head attention layer, support self attention and cross attention.

    Args:
        hidden_size: size of hidden state.
        num_attention_heads: number of attention heads.
        is_cross_attention: used to specify whether it is self attention or cross attention.
            Defaults to False.
        attention_dropout_prob: dropout probability of attention weights.
            Defaults to 0.0.
        output_dropout_prob: dropout probability of output. Defaults to 0.0.
        init_method: method to initialize the input layer weights.
            Defaults to ``init.xavier_normal_``.
        output_layer_init_method: method to initialize the output layer weights.
            If None, use ``init_method``.
        layer_idx: a layer_idx sign which determines the placements.
            It will be used in pipeline parallelism. Defaults to 0.
    """

    def __init__(
        self,
        hidden_size,
        num_attention_heads,
        head_size,
        relative_attention_num_buckets,
        is_cross_attention=False,
        attention_dropout_prob=0.0,
        output_dropout_prob=0.0,
        init_method=nn.init.xavier_normal_,
        output_layer_init_method=None,
        padding_idx=None,
        *,
        layer_idx=0,
        has_relative_attention_bias=False,
        is_decoder=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.has_relative_attention_bias = has_relative_attention_bias
        self.is_decoder = is_decoder
        self.attention_dropout_prob = attention_dropout_prob
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        self.num_heads = num_attention_heads
        self.head_size = head_size

        self.dropout = nn.Dropout(p=attention_dropout_prob)
        self.norm_factor = 1.0 / math.sqrt(float(self.head_size))

        self.is_cross_attention = is_cross_attention

        self.output_dropout = nn.Dropout(p=output_dropout_prob)

        if self.is_cross_attention:
            self.query = Linear(
                self.hidden_size,
                self.num_heads * self.head_size,
                bias=False,
                parallel="col",
                init_method=init_method,
                layer_idx=layer_idx,
            )
            self.key_value = Linear(
                self.hidden_size,
                self.num_heads * self.head_size * 2,
                bias=False,
                parallel="col",
                init_method=init_method,
                layer_idx=layer_idx,
            )
        else:
            self.query_key_value = Linear(
                self.hidden_size,
                self.num_heads * self.head_size * 3,
                bias=False,
                parallel="col",
                init_method=init_method,
                layer_idx=layer_idx,
            )

        self.dense = Linear(
            self.num_heads * self.head_size,
            self.hidden_size,
            bias=False,
            parallel="row",
            init_method=output_layer_init_method,
            skip_bias_add=False,
            layer_idx=layer_idx,
        )

        if self.has_relative_attention_bias:
            self.relative_attention_bias = Embedding(
                self.relative_attention_num_buckets,
                self.num_heads,
                padding_idx=padding_idx,
                layer_idx=layer_idx,
            )

    def forward(
        self,
        hidden_states: flow.Tensor,
        encoder_states: flow.Tensor = None,
        attention_mask: flow.Tensor = None,
        past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
        use_cache: bool = False,
        position_bias=None,
        query_length=None,
    ):
        """
        Args:
            hidden_states (flow.Tensor): shape is [bsz, tgt_len, hidden_size].
            encoder_states (flow.Tensor, optional): shape is [bsz, src_len, hidden_size].
                Defaults to None.
            attention_mask (flow.Tensor, optional): shape is [bsz, 1, tgt_len, src_len].
                It should be the combination of padding mask and causal mask.
                It is the padding mask of source input when used with self-attention in encoder.
                And it is the combination of padding mask of target input and causal mask when
                used with self-attention in decoder. It is the padding mask of source input when
                used with cross-attention in decoder.
                Defaults to None.
            past_key_value (Tuple[flow.Tensor, flow.Tensor], optional): tuple of key and value,
                each shape is [bsz, num_heads, src_len, head_size]. Defaults to None.
            use_cache (bool, optional): it will be set to True, when the model is in the inference
                phase and used for incremental decoding. Defaults to False.
        """
        if encoder_states is not None:
            encoder_states = encoder_states.to_global(placement=hidden_states.placement)

        if attention_mask is not None:
            attention_mask = attention_mask.to_global(placement=hidden_states.placement)

        # hidden_states shape: [seq_len, batch_size, hidden_size]
        real_seq_length, bsz = hidden_states.size()[:2]

        if past_key_value is not None:
            assert (
                len(past_key_value) == 2
            ), "past_key_value should have 2 past states: keys and values."
            f"Got {len(past_key_value)} past states.\n"
            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

        key_length = real_seq_length if encoder_states is None else encoder_states.shape[0]

        if self.is_cross_attention:
            query = self.query(hidden_states)
            query = query.view(-1, bsz, self.num_heads, self.head_size)
            query = query.permute(1, 2, 0, 3)  # bsz, num_head, seq_len, head_size
            if past_key_value is not None:
                key, value = past_key_value
            elif encoder_states is not None:
                key_value = self.key_value(encoder_states)
                key_value = key_value.view(-1, bsz, self.num_heads, 2 * self.head_size)
                key_value = key_value.permute(1, 2, 0, 3)
                key, value = flow.chunk(key_value, chunks=2, dim=-1)
            else:
                raise ValueError(
                    "past_key_value and encoder_states cannot be None at the same time."
                )
        else:
            query_key_value = self.query_key_value(hidden_states)
            if use_cache:
                query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
                query_key_value = query_key_value.permute(
                    0, 2, 1, 3
                )  # [bsz, num_heads, src_len, 3 * head_size]
                query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
            else:
                attention_scores, value = flow._C.fused_self_attention(
                    query_key_value, head_size=self.head_size, alpha=1
                )
            if past_key_value is not None:
                past_key, past_value = past_key_value
                key = flow.cat((past_key.type_as(key), key), dim=2)
                value = flow.cat((past_value.type_as(value), value), dim=2)

        if use_cache:
            past_key_value = (key, value)

        if self.is_cross_attention or use_cache:
            attention_scores = flow.matmul(query, key, transpose_b=True, alpha=1)

        if position_bias is None:
            if not self.has_relative_attention_bias:
                position_bias = flow.zeros(
                    (1, self.num_heads, real_seq_length, key_length),
                    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
                    placement=attention_scores.placement,
                )
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, placement=attention_mask.placement
                )

            if past_key_value is not None:
                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]

        if attention_mask is not None:
            if use_cache:
                attention_mask = attention_mask.expand_as(attention_scores)

            attention_weights = flow._C.fused_bias_add_scale_mask_softmax_dropout(
                attention_scores,
                position_bias,
                attention_mask,
                fill_value=-10000.0,
                scale=1,
                p=self.attention_dropout_prob,
            )[0]
        else:
            attention_scores = attention_scores + position_bias
            attention_weights = flow.softmax(attention_scores, dim=-1)
            attention_weights = self.dropout(attention_weights)

        context = flow.matmul(attention_weights, value)

        """ transpose [batch_size, num_head, seq_len, head_size] to
            [seq_len, batch_size, num_head, head_size]
        """
        context = flow._C.transpose(context, perm=(2, 0, 1, 3))

        output = self.dense(context.flatten(2))
        output = self.output_dropout(output)

        if use_cache:
            output = (output, past_key_value)

        output = (output,) + (position_bias,)

        return output

    def extra_repr(self) -> str:
        return "hidden_size={}, num_heads={}, is_cross_attention={}".format(
            self.hidden_size,
            self.num_heads,
            self.is_cross_attention,
        )

    def _relative_position_bucket(
        self, relative_position, bidirectional=True, num_buckets=32, max_distance=128
    ):
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets = (
                relative_buckets + (relative_position > 0).to(flow.long) * num_buckets
            )
            relative_position = flow.abs(relative_position)
        else:
            relative_position = (
                -1
                * flow.min(
                    relative_position,
                    flow.zeros(
                        relative_position.size(),
                        sbp=relative_position.sbp,
                        placement=relative_position.placement,
                    ),
                ).to(flow.long)
            )
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        relative_postion_if_large = max_exact + (
            flow.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(flow.long)
        relative_postion_if_large = flow.min(
            relative_postion_if_large,
            flow.zeros(
                relative_postion_if_large.size(),
                dtype=relative_postion_if_large.dtype,
                sbp=relative_postion_if_large.sbp,
                placement=relative_postion_if_large.placement,
            ).fill_(num_buckets - 1),
        )

        relative_buckets = relative_buckets + flow.where(
            is_small, relative_position, relative_postion_if_large
        )
        return relative_buckets

    def compute_bias(self, query_length, key_length, placement=None):
        """Compute binned relative position bias"""
        context_position = flow.arange(
            query_length,
            dtype=flow.long,
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            placement=placement,
        )
        memory_position = flow.arange(
            key_length,
            dtype=flow.long,
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            placement=placement,
        )
        relative_position = memory_position[None, :] - context_position[:, None]
        # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
        )
        # shape (query_length, key_length)
        values = self.relative_attention_bias(relative_position_bucket)
        # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)
        # shape (1, num_heads, query_length, key_length)
        return values
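For readers unfamiliar with T5-style relative attention, _relative_position_bucket maps a signed token offset to one of num_buckets ids: small offsets get exact buckets, larger ones get logarithmically spaced buckets up to max_distance, and in the bidirectional case the sign is encoded in the upper half of the bucket range. A plain-Python restatement for a single offset (a sketch for illustration, not part of the commit):

import math

def relative_position_bucket(rel_pos, bidirectional=True, num_buckets=32, max_distance=128):
    # scalar restatement of MultiheadAttention._relative_position_bucket above
    bucket = 0
    if bidirectional:
        num_buckets //= 2
        bucket += num_buckets if rel_pos > 0 else 0  # direction goes in the upper half
        rel_pos = abs(rel_pos)
    else:
        rel_pos = max(-rel_pos, 0)
    max_exact = num_buckets // 2
    if rel_pos < max_exact:
        return bucket + rel_pos  # exact buckets for short distances
    # logarithmic buckets for larger distances, capped at num_buckets - 1
    large = max_exact + int(
        math.log(rel_pos / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    )
    return bucket + min(large, num_buckets - 1)

print([relative_position_bucket(d) for d in (-64, -2, 0, 1, 2, 8, 64)])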
projects/MT5/layers/embed_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import oneflow as flow
import oneflow.nn as nn
from oneflow.nn import init

import libai.utils.distributed as dist
from libai.layers.embedding import VocabEmbedding


class MT5Embedding(flow.nn.Module):
    def __init__(
        self,
        hidden_size,
        vocab_size,
        embedding_dropout_prob,
        pad_token_id=0,
        init_method=flow.nn.init.xavier_normal_,
        amp_enabled=False,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.word_embeddings = VocabEmbedding(
            num_embeddings=vocab_size,
            embedding_dim=hidden_size,
            init_method=init_method,
            amp_enabled=amp_enabled,
            padding_idx=pad_token_id,
        )
        self.embedding_dropout = flow.nn.Dropout(embedding_dropout_prob)

    def forward(self, input_ids):
        word_embeddings = self.word_embeddings(input_ids)
        embeddings = self.embedding_dropout(word_embeddings)
        return embeddings


class Embedding(nn.Module):
    """Construct the trainable embedding module, which does not support parallelization.
    This can be used for positional embedding and token type embedding.

    Arguments:
        num_embeddings: size of vocabulary.
        embedding_dim: dimension of embeddings.
        padding_idx: pad index. Defaults to None.
        init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
        amp_enabled: fp16 option for embedding weight. Defaults to False.
    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        padding_idx=None,
        init_method=init.xavier_normal_,
        amp_enabled=False,
        layer_idx=0,
    ):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        if padding_idx is not None:
            if padding_idx > 0:
                assert (
                    padding_idx < self.num_embeddings
                ), "Padding_idx must be within num_embeddings"
            elif padding_idx < 0:
                assert (
                    padding_idx >= -self.num_embeddings
                ), "Padding_idx must be within num_embeddings"
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx
        self.init_method = init_method
        self.amp_enabled = amp_enabled

        assert num_embeddings > 0
        self.weight = nn.Parameter(
            flow.empty(
                (num_embeddings, embedding_dim),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            )
        )
        self.init_method(self.weight)

    def forward(self, input_ids):
        weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
        input_embeds = flow._C.gather(weight, input_ids, axis=0)
        return input_embeds

    def _fill_padding_idx_with_zero(self) -> None:
        if self.padding_idx is not None:
            with flow.no_grad():
                self.weight[self.padding_idx] = flow.zeros(
                    self.embedding_dim,
                    placement=dist.get_layer_placement(0),
                    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
                )

    def extra_repr(self) -> str:
        s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
        if self.padding_idx is not None:
            s += ", padding_idx={padding_idx}"
        return s.format(**self.__dict__)
projects/MT5/layers/lm_head_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from oneflow import nn

from libai.layers import Linear, LMLogits


class LMHead(nn.Module):
    def __init__(self, model_type, hidden_size, vocab_size, hidden_layers):
        super().__init__()
        if model_type == "mt5":
            self.lm_head = Linear(
                hidden_size, vocab_size, bias=False, layer_idx=2 * hidden_layers - 1
            )
        else:
            self.lm_head = LMLogits(vocab_size, bias=True)

    def forward(self, decoder_states, embed_weight=None):
        if isinstance(self.lm_head, Linear):
            logits = self.lm_head(decoder_states)
        else:
            logits = self.lm_head(decoder_states, embed_weight)
        return logits
projects/MT5/layers/logits_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import oneflow as flow
from oneflow import nn

from libai.layers import Linear
from libai.utils import distributed as dist


class LMLogits(nn.Module):
    def __init__(self, vocab_size, hidden_size=None, bias=False, model_type="t5", layer_idx=-1):
        super().__init__()
        self.model_type = model_type
        if model_type == "t5":
            self.bias = (
                nn.Parameter(
                    flow.zeros(
                        (vocab_size,),
                        dtype=flow.float32,
                        placement=dist.get_layer_placement(layer_idx),
                        sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
                    )
                )
                if bias
                else None
            )
        elif model_type == "mt5":
            self.linear = Linear(hidden_size, vocab_size, bias=False, layer_idx=layer_idx)

    def forward(self, input, word_embeddings=None):
        if self.model_type == "t5":
            w = word_embeddings.to_global(placement=input.placement)
            input = input.to_global(grad_sbp=input.sbp)
            logits = flow._C.matmul(input, w, transpose_b=True)
            if self.bias is not None:
                logits = logits + self.bias
        else:
            logits = self.linear(input)
        return logits