Commit 9fdb7dab authored by yuguo960516

bloom
from flowvision.transforms import transforms, InterpolationMode
from flowvision.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from libai.config import LazyCall, get_config
from configs.models.mae_vit_base_patch16 import model
from data.pretraining_imagenet import PretrainingImageNetDataset
from utils.lr_decay import param_groups_weight_decay
from utils.scheduler import warmup_cosine_lr_scheduler
train = get_config("common/train.py").train
optim = get_config("common/optim.py").optim
graph = get_config("common/models/graph.py").graph
dataloader = get_config("common/data/imagenet.py").dataloader
# Enable graph mode for faster MAE training
graph.enabled = True
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet"
dataloader.train.dataset[0]._target_ = PretrainingImageNetDataset
# No test data for pretraining
del dataloader.test
# Refine data transform to MAE's default settings
transform_train = LazyCall(transforms.Compose)(
transforms=[
LazyCall(transforms.RandomResizedCrop)(
size=(224, 224),
scale=(0.2, 1.0),
interpolation=InterpolationMode.BICUBIC,
),
LazyCall(transforms.RandomHorizontalFlip)(),
LazyCall(transforms.ToTensor)(),
LazyCall(transforms.Normalize)(
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD,
),
]
)
dataloader.train.dataset[0].transform = transform_train
# number of devices
n_gpus = 8
# Refine training settings for MAE
train.train_micro_batch_size = 64
train.num_accumulation_steps = 8
effective_batch_size = train.train_micro_batch_size * train.num_accumulation_steps * n_gpus
train.train_epoch = 800
train.warmup_ratio = 40 / 800
train.log_period = 20
train.checkpointer.save_model_after_n_epoch = 20
# enable activation checkpointing
# train.activation_checkpoint.enabled = True
# set rdma enabled when num nodes > 1
# train.rdma_enabled = False
# The base learning rate in MAE is 1.5e-4.
# The actual learning rate is computed by the linear scaling rule:
#     lr = base_lr * effective_batch_size / 256
# In LiBai, refine the actual learning rate according to your own settings.
# Here we use 8 GPUs with a micro batch size of 64 and 8 accumulation steps,
# so the effective batch size is 64 * 8 * 8 = 4096.
base_lr = 1.5e-4
actual_lr = base_lr * effective_batch_size / 256
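# With the settings above this gives 1.5e-4 * 4096 / 256 = 2.4e-3.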
# Refine optim settings
optim.params._target_ = param_groups_weight_decay
optim.params.weight_decay = 0.05
optim.lr = actual_lr
optim.betas = (0.9, 0.95)
del optim.params.clip_grad_max_norm
del optim.params.clip_grad_norm_type
del optim.params.weight_decay_norm
del optim.params.weight_decay_bias
del optim.weight_decay
# Refine scheduler
# Default scheduler in LiBai training config is WarmupCosineLR
train.scheduler = LazyCall(warmup_cosine_lr_scheduler)(
warmup_factor=0.0,
min_lr=0.0,
)
# AMP
train.amp.enabled = True
# Distributed Settings
train.dist.data_parallel_size = n_gpus
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1
# train.dist.pipeline_num_layers = model.depth
from functools import partial
from libai.config import LazyCall
from libai.layers import LayerNorm
from modeling.mae import MaskedAutoencoderViT
model = LazyCall(MaskedAutoencoderViT)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
decoder_embed_dim=512,
decoder_depth=8,
decoder_num_heads=16,
mlp_ratio=4,
norm_layer=partial(LayerNorm, eps=1e-6),
norm_pix_loss=True,
mask_ratio=0.75,
)
from .mae_vit_base_patch16 import model
model.patch_size = 14
model.embed_dim = 1280
model.depth = 32
model.num_heads = 16
from .mae_vit_base_patch16 import model
model.embed_dim = 1024
model.depth = 24
model.num_heads = 16
from libai.config import LazyCall
from modeling.vit import VisionTransformer
model = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=True,
)
from .vit_base_patch16 import model
model.patch_size = 14
model.embed_dim = 1280
model.depth = 32
model.num_heads = 16
model.drop_path_rate = 0.2
from .vit_base_patch16 import model
model.embed_dim = 1024
model.depth = 24
model.num_heads = 16
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.data.datasets.imagenet import ImageNetDataset
from libai.data.structures import Instance
class PretrainingImageNetDataset(ImageNetDataset):
"""ImageNet Dataset in LiBai for Pretraining
Return:
images: ImageNet train set images
"""
def __getitem__(self, index: int):
data_sample = super().__getitem__(index)
return Instance(images=data_sample.get("images"))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
class SoftTargetCrossEntropy(nn.Module):
def __init__(self):
super(SoftTargetCrossEntropy, self).__init__()
def forward(self, x: flow.Tensor, target: flow.Tensor) -> flow.Tensor:
pred = flow.log_softmax(x, dim=-1)
loss = -target * pred
# The sum and mean below should be computed in float32.
# amp_white_identity keeps the preceding `-target * pred` in float16,
# while amp_black_identity forces the following sum and mean to run in float32.
loss = flow._C.amp_white_identity(loss)
loss = flow._C.amp_black_identity(loss)
loss = flow.sum(loss, dim=-1)
return loss.mean()
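# Usage sketch (illustrative, not part of this commit): with soft labels produced
# by mixup/cutmix, both inputs have shape [N, num_classes]:
#   criterion = SoftTargetCrossEntropy()
#   loss = criterion(logits, soft_targets)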
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# MAE Model
# References:
# mae: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import oneflow as flow
import oneflow.nn as nn
import libai.utils.distributed as dist
from libai.config import configurable
from libai.layers import LayerNorm, Linear, PatchEmbedding, TransformerLayer
from .pos_embed import get_2d_sincos_pos_embed
class MaskedAutoencoderViT(nn.Module):
"""Masked Autoencoder with VisionTransformer backbone"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=1024,
depth=24,
num_heads=16,
decoder_embed_dim=512,
decoder_depth=8,
decoder_num_heads=16,
mlp_ratio=4.0,
norm_layer=LayerNorm,
norm_pix_loss=False,
mask_ratio=0.75,
):
super().__init__()
self.mask_ratio = mask_ratio
# --------------------------------------------------------------------------
# MAE encoder specifics
self.patch_embed = PatchEmbedding(img_size, patch_size, in_chans, embed_dim)
num_patches = self.patch_embed.num_patches
self.cls_token = nn.Parameter(
flow.zeros(
1,
1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.pos_embed = nn.Parameter(
flow.zeros(
1,
num_patches + 1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.blocks = nn.ModuleList(
[
TransformerLayer(
hidden_size=embed_dim,
ffn_hidden_size=int(embed_dim * mlp_ratio),
num_attention_heads=num_heads,
layer_idx=i,
)
for i in range(depth)
]
)
# TODO: set norm layer placement stage id
self.norm = norm_layer(embed_dim, layer_idx=depth)
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
# MAE decoder specifics
self.decoder_embed = Linear(embed_dim, decoder_embed_dim, bias=True, layer_idx=depth)
self.mask_token = nn.Parameter(
flow.zeros(
1,
1,
decoder_embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(depth),
)
)
self.decoder_pos_embed = nn.Parameter(
flow.zeros(
1,
num_patches + 1,
decoder_embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(depth),
)
)
self.decoder_blocks = nn.ModuleList(
[
TransformerLayer(
hidden_size=decoder_embed_dim,
ffn_hidden_size=int(decoder_embed_dim * mlp_ratio),
num_attention_heads=decoder_num_heads,
layer_idx=(i + depth),
)
for i in range(decoder_depth)
]
)
self.decoder_norm = norm_layer(decoder_embed_dim, layer_idx=-1)
self.decoder_pred = Linear(
decoder_embed_dim, patch_size ** 2 * in_chans, bias=True, layer_idx=-1
) # decoder to patch
# --------------------------------------------------------------------------
self.norm_pix_loss = norm_pix_loss
self.initialize_weights()
def initialize_weights(self):
# initialization
# initialize (and freeze) pos_embed by sin-cos embedding
pos_embed = get_2d_sincos_pos_embed(
self.pos_embed.shape[-1], int(self.patch_embed.num_patches ** 0.5), cls_token=True
)
self.pos_embed.data.copy_(
flow.from_numpy(pos_embed)
.float()
.unsqueeze(0)
.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=self.pos_embed.placement,
)
)
decoder_pos_embed = get_2d_sincos_pos_embed(
self.decoder_pos_embed.shape[-1],
int(self.patch_embed.num_patches ** 0.5),
cls_token=True,
)
self.decoder_pos_embed.data.copy_(
flow.from_numpy(decoder_pos_embed)
.float()
.unsqueeze(0)
.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=self.decoder_pos_embed.placement,
)
)
# initialize patch_embed like nn.Linear (instead of nn.Conv2d)
w = self.patch_embed.proj.weight.data
flow.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
# timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
flow.nn.init.normal_(self.cls_token, std=0.02)
flow.nn.init.normal_(self.mask_token, std=0.02)
# initialize nn.Linear and nn.LayerNorm
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, Linear):
# we use xavier_uniform following official JAX ViT:
flow.nn.init.xavier_uniform_(m.weight)
if isinstance(m, Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"num_heads": cfg.num_heads,
"decoder_embed_dim": cfg.decoder_embed_dim,
"decoder_depth": cfg.decoder_depth,
"decoder_num_heads": cfg.decoder_num_heads,
"mlp_ratio": cfg.mlp_ratio,
"norm_layer": cfg.norm_layer,
"norm_pix_loss": cfg.norm_pix_loss,
"mask_ratio": cfg.mask_ratio,
}
def patchify(self, imgs):
"""
imgs: (N, 3, H, W)
x: (N, L, patch_size**2 *3)
"""
p = self.patch_embed.patch_size[0]
assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
h = w = imgs.shape[2] // p
x = imgs.reshape(imgs.shape[0], 3, h, p, w, p)
# TODO: replace permute with flow.einsum
# (n c h p w q) -> (n h w p q c)
x = x.permute(0, 2, 4, 3, 5, 1)
x = x.reshape(imgs.shape[0], h * w, p ** 2 * 3)
return x
def unpatchify(self, x):
"""
x: (N, L, patch_size**2 *3)
imgs: (N, 3, H, W)
"""
p = self.patch_embed.patch_size[0]
h = w = int(x.shape[1] ** 0.5)
assert h * w == x.shape[1]
x = x.reshape(x.shape[0], h, w, p, p, 3)
# TODO: replace permute with flow.einsum
# (n h w p q c) -> (n c h p w q)
x = x.permute(0, 5, 1, 3, 2, 4)
imgs = x.reshape(x.shape[0], 3, h * p, h * p)
return imgs
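# Shape example (illustrative): with imgs of shape (N, 3, 224, 224) and p = 16,
# patchify gives (N, 196, 768) and unpatchify maps it back to (N, 3, 224, 224).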
def random_masking(self, x, mask_ratio):
"""
Perform per-sample random masking by per-sample shuffling.
Per-sample shuffling is done by argsort random noise.
x: [N, L, D], sequence
"""
N, L, D = x.shape
len_keep = int(L * (1 - mask_ratio))
noise = flow.rand(N, L, sbp=x.sbp, placement=x.placement) # noise in [0, 1]
# sort noise for each sample
ids_shuffle = flow.argsort(noise, dim=1) # ascend: small is keep, large is remove
ids_restore = flow.argsort(ids_shuffle, dim=1)
# keep the first subset
ids_keep = ids_shuffle[:, :len_keep]
x_masked = flow.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
# generate the binary mask: 0 is keep, 1 is remove
mask = flow.ones([N, L], sbp=x.sbp, placement=x.placement)
mask[:, :len_keep] = 0
# unshuffle to get binary mask
mask = flow.gather(mask, dim=1, index=ids_restore)
return x_masked, mask, ids_restore
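# Shape example (illustrative): with N=2, L=196, D=768 and mask_ratio=0.75,
# len_keep = 49, so x_masked is [2, 49, 768] while mask and ids_restore are [2, 196].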
def forward_encoder(self, x, mask_ratio):
# embed patches
x = self.patch_embed(x)
# add pos embed w/o cls token
x = x + self.pos_embed[:, 1:, :]
# masking: length -> length * mask_ratio
x, mask, ids_restore = self.random_masking(x, mask_ratio)
# append cls token
cls_token = self.cls_token + self.pos_embed[:, :1, :]
# Directly expanding cls_token (shape = (1, 1, D), sbp = B)
# would produce a huge tensor of shape [B*N, 1, D]
# (where B = local batch size, N = total number of devices),
# but we only need an expanded cls_token of shape [B, 1, D] per device.
# Since local-to-global tensor conversion is not available in graph mode for now,
# we expand cls_token in two stages as below.
world_size = flow.env.get_world_size()
# repeat to (N, 1, D), sbp = B
cls_token = cls_token.expand(world_size, -1, -1)
# to_global(sbp=S(0)), local shape = (1, 1, D)
cls_token = cls_token.to_global(sbp=x.sbp)
# second expand from (N, 1, D) to (B*N, 1, D)
# (global shape, sbp=S(0)), local shape=(B, 1, D),
# by this way we wouldn't produce a (B*N, 1, D) tensor in local view.
cls_tokens = cls_token.repeat(x.shape[0] // world_size, 1, 1)
x = flow.cat((cls_tokens, x), dim=1)
# apply Transformer blocks
for blk in self.blocks:
x = blk(x)
x = self.norm(x)
return x, mask, ids_restore
def forward_decoder(self, x, ids_restore):
# embed tokens
x = self.decoder_embed(x)
# append mask tokens to sequence
# mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
# The line above would produce a huge mask_tokens of shape [B*N, L, D]
# (where B = local batch size, N = total number of devices),
# while we only need mask_tokens of shape [B, L, D] in the local view.
# Since local-to-global tensor conversion is not available in graph mode for now,
# we repeat the mask token in two stages as below.
world_size = flow.env.get_world_size()
# repeat to (N, 1, D), sbp = B
mask_token = self.mask_token.repeat(world_size, 1, 1)
# to_global(sbp=S(0)), local shape = (1, 1, D)
mask_token = mask_token.to_global(sbp=x.sbp)
# second repeat from (N, 1, D) to (B*N, L, D)
# (global shape, sbp=S(0)), local shape = (B, L, D),
# and the originally huge mask_tokens with shape (B*N, L, D)
# wouldn't be produced in local view.
mask_tokens = mask_token.repeat(
x.shape[0] // world_size, ids_restore.shape[1] + 1 - x.shape[1], 1
)
x_ = flow.cat([x[:, 1:, :], mask_tokens], dim=1) # no cls token
x_ = flow.gather(
x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
) # unshuffle
x = flow.cat([x[:, :1, :], x_], dim=1) # append cls token
# add pos embed
x = x + self.decoder_pos_embed
# apply Transformer blocks
for blk in self.decoder_blocks:
x = blk(x)
x = self.decoder_norm(x)
# predictor projection
x = self.decoder_pred(x)
# remove cls token
x = x[:, 1:, :]
return x
def forward_loss(self, imgs, pred, mask):
"""
imgs: [N, 3, H, W]
pred: [N, L, p*p*3]
mask: [N, L], 0 is keep, 1 is remove,
"""
target = self.patchify(imgs)
if self.norm_pix_loss:
mean = target.mean(dim=-1, keepdim=True)
var = target.var(dim=-1, keepdim=True)
target = (target - mean) / (var + 1.0e-6) ** 0.5
loss = (pred - target) ** 2
# We want the loss above to be computed in float16
# and the mean/sum below to be computed in float32.
# This amp_white_identity keeps the preceding ops in float16,
loss = flow._C.amp_white_identity(loss)
# and this amp_black_identity forces the succeeding ops to run in float32.
loss = flow._C.amp_black_identity(loss)
loss = loss.mean(dim=-1) # [N, L], mean loss per patch
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
def forward(self, images):
latent, mask, ids_restore = self.forward_encoder(images, self.mask_ratio)
pred = self.forward_decoder(latent, ids_restore) # [N, L, p*p*3]
loss = self.forward_loss(images, pred, mask)
if self.training:
return {"losses": loss}
else:
return {
"losses": loss,
"pred": pred,
"mask": mask,
}
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
"""
Arguments:
embed_dim: hidden_size of the input tokens
grid_size: int of the grid height and width
cls_token: with cls_token or not
Return:
pos_embed: [grid_size*grid_size, embed_dim]
or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
grid_h = np.arange(grid_size, dtype=np.float32)
grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_size, grid_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token:
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float64)
omega /= embed_dim / 2.0
omega = 1.0 / 10000 ** omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
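# Shape example (illustrative): get_2d_sincos_pos_embed(768, 14, cls_token=True)
# returns a (1 + 14 * 14, 768) numpy array, matching the (num_patches + 1, embed_dim)
# positional embedding used by the MAE encoder above.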
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ViT Model
# References:
# mae: https://github.com/facebookresearch/mae/blob/main/models_vit.py
# --------------------------------------------------------
import oneflow as flow
import libai.models.vision_transformer
class VisionTransformer(libai.models.vision_transformer.VisionTransformer):
"""Vision Transformer for MAE
LiBai impl of: `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
- https://arxiv.org/abs/2010.11929
"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.0,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.0,
global_pool=False,
num_classes=1000,
loss_func=None,
):
super(VisionTransformer, self).__init__(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
depth=depth,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=drop_path_rate,
num_classes=num_classes,
loss_func=loss_func,
)
self.global_pool = global_pool
def no_weight_decay(self):
return {"pos_embed", "cls_token"}
def forward_head(self, x):
if self.global_pool:
x = x[:, 1:, :] # global pool without cls token
# We want the mean to be computed in float32.
# The amp_white_identity pair keeps the computation before and after the mean in float16,
# while the amp_black_identity pair forces the mean itself to run in float32.
x = flow._C.amp_white_identity(x)
x = flow._C.amp_black_identity(x)
x = x.mean(dim=1)
x = flow._C.amp_black_identity(x)
x = flow._C.amp_white_identity(x)
outcome = self.norm(x)
outcome = self.head(outcome)
else:
x = self.norm(x)
outcome = x[:, 0]
outcome = self.head(outcome)
return outcome
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
import sys
import numpy as np
import oneflow as flow
from utils.weight_convert import load_torch_checkpoint
from libai.config import LazyConfig, default_argument_parser, try_get_key
from libai.engine import DefaultTrainer, default_setup
from libai.utils.checkpoint import Checkpointer
sys.path.append(".")
logger = logging.getLogger("libai.mae." + __name__)
class Trainer(DefaultTrainer):
@classmethod
def build_model(cls, cfg):
model = super().build_model(cfg)
if try_get_key(cfg, "finetune") is not None:
if cfg.finetune.enable is True:
logger.info("Loading pretrained weight for finetuning")
assert cfg.finetune.weight_style in ["oneflow", "pytorch"]
if cfg.finetune.weight_style == "oneflow":
Checkpointer(model).load(cfg.finetune.path)
elif cfg.finetune.weight_style == "pytorch":
model = load_torch_checkpoint(model, cfg, path=cfg.finetune.path, strict=False)
else:
raise NotImplementedError(
"Only support loading oneflow & pytorch pretrained weight now."
)
return model
def main(args):
cfg = LazyConfig.load(args.config_file)
cfg = LazyConfig.apply_overrides(cfg, args.opts)
default_setup(cfg, args)
if args.fast_dev_run:
cfg.train.train_epoch = 0
cfg.train.checkpointer.period = 5
cfg.train.train_iter = 10
cfg.train.evaluation.eval_period = 10
cfg.train.log_period = 1
if args.eval_only:
cfg.eval_only = True
tokenizer = None
if try_get_key(cfg, "tokenization.setup", default=False):
tokenizer = Trainer.build_tokenizer(cfg)
model = Trainer.build_model(cfg)
Checkpointer(model, save_dir=cfg.train.output_dir).resume_or_load(
cfg.train.load_weight, resume=args.resume
)
if try_get_key(cfg, "train.graph.enabled", default=False):
model = Trainer.build_graph(cfg, model, is_train=False)
test_loader = Trainer.build_test_loader(cfg, tokenizer)
if len(test_loader) == 0:
logger.info("No dataset in dataloader.test, please set dataset for dataloader.test")
_ = Trainer.test(cfg, test_loader, model)
return
# manual different seed for each rank
seed_for_rank = cfg.train.seed + flow.env.get_rank()
flow.manual_seed(seed_for_rank)
flow.cuda.manual_seed(seed_for_rank)
np.random.seed(seed_for_rank)
random.seed(seed_for_rank)
trainer = Trainer(cfg)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
main(args)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# References:
# mae: https://github.com/facebookresearch/mae/blob/main/util/lr_decay.py
# --------------------------------------------------------
import logging
logger = logging.getLogger("libai.mae." + __name__)
def param_groups_lrd(model, weight_decay=0.05, layer_decay=0.75):
"""
Parameter groups for layer-wise lr decay
Modified from BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
"""
param_group_names = {}
param_groups = {}
no_weight_decay_list = model.no_weight_decay()
num_layers = len(model.blocks) + 1
layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1))
for name, param in model.named_parameters():
if not param.requires_grad:
continue
if param.ndim == 1 or name in no_weight_decay_list:
g_decay = "no_decay"
this_decay = 0.0
else:
g_decay = "decay"
this_decay = weight_decay
layer_idx = get_layer_idx_for_vit(name, num_layers)
group_name = "layer_%d_%s" % (layer_idx, g_decay)
# logger.info(
# f"{name}, shape={param.shape}, {g_decay}={this_decay}"
# f", layer_scale={layer_scales[layer_idx]}"
# )
if group_name not in param_group_names:
this_scale = layer_scales[layer_idx]
param_group_names[group_name] = {
"lr_scale": this_scale,
"weight_decay": this_decay,
"params": [],
}
param_groups[group_name] = {
"lr_scale": this_scale,
"weight_decay": this_decay,
"params": [],
}
param_group_names[group_name]["params"].append(name)
param_groups[group_name]["params"].append(param)
return list(param_groups.values())
def get_layer_idx_for_vit(name, num_layers):
"""
Assign a parameter with its layer id
Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
"""
if name in ["cls_token", "pos_embed"]:
return 0
elif name.startswith("patch_embed"):
return 0
elif name.startswith("blocks"):
return int(name.split(".")[1]) + 1
else:
return num_layers
# Refer to: add_weight_decay in
# https://github.com/rwightman/pytorch-image-models/blob/v0.3.3/timm/optim/optim_factory.py
def param_groups_weight_decay(model, weight_decay=1e-5, skip_list=()):
decay_params = []
no_decay_params = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue # frozen weights
if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
no_decay_params.append(param)
else:
decay_params.append(param)
return [
{"params": no_decay_params, "weight_decay": 0.0},
{"params": decay_params, "weight_decay": weight_decay},
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import math
import oneflow as flow
from oneflow.optim.lr_scheduler import _LRScheduler
logger = logging.getLogger(__name__)
class LayerScaleWarmupCosineDecayLR(_LRScheduler):
def __init__(
self,
optimizer: flow.optim.Optimizer,
steps: int,
warmup_steps: int,
warmup_factor: float,
min_lr: float = 0.0,
last_step: int = -1,
verbose: bool = False,
):
self.total_steps = steps
self.decay_steps = steps - warmup_steps
self.warmup_steps = warmup_steps
self.warmup_factor = warmup_factor
self.min_lr = min_lr
super().__init__(optimizer, last_step, verbose)
def get_lr(self, base_lr, step):
if step < self.warmup_steps:
progress = step / self.warmup_steps
lr = base_lr * progress
elif step < self.total_steps:
progress = (step - self.warmup_steps) / self.decay_steps
lr = self.min_lr + (base_lr - self.min_lr) * 0.5 * (1.0 + math.cos(math.pi * progress))
else:
lr = self.min_lr
return lr
def update_lrs(self, lrs):
self._last_lr = []
for i, (group, lr) in enumerate(zip(self.optimizer.param_groups, lrs)):
if "lr_scale" in group:
group["lr"] = lr * group["lr_scale"]
else:
group["lr"] = lr
self._last_lr.append(lr)
if self.verbose:
self.print_lr(i, lr)
def warmup_layerscale_cosine_lr_scheduler(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_iter: int,
warmup_factor: float,
min_lr: float = 0.0,
):
return LayerScaleWarmupCosineDecayLR(
optimizer,
steps=max_iter,
warmup_steps=warmup_iter,
warmup_factor=warmup_factor,
min_lr=min_lr,
)
def warmup_cosine_lr_scheduler(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_iter: int,
warmup_factor: float = 0.0,
warmup_method: str = "linear",
min_lr: float = 0.0,
):
cosine_lr = flow.optim.lr_scheduler.CosineAnnealingLR(
optimizer, T_max=max_iter - warmup_iter, eta_min=min_lr
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return CosineLR")
return cosine_lr
if warmup_iter > max_iter:
logger.warning("warmup iters is larger than the total training iters")
warmup_cosine_lr = flow.optim.lr_scheduler.WarmupLR(
cosine_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_cosine_lr
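# Usage sketch (illustrative, assuming an already-built oneflow optimizer):
#   scheduler = warmup_cosine_lr_scheduler(
#       optimizer, max_iter=total_train_iters, warmup_iter=warmup_iters, warmup_factor=0.0
#   )
#   # call scheduler.step() once per training iteration
# In the MAE config above, LiBai instantiates this through train.scheduler via LazyCall.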
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import oneflow as flow
import torch
from flowvision.layers.weight_init import trunc_normal_
logger = logging.getLogger("libai.mae." + __name__)
def convert_qkv_weight(cfg, value):
"""
Convert qkv.weight to be compatible with LiBai transformer layer
Args:
cfg: config file
value: qkv.weight in the loaded checkpoint
"""
num_heads = cfg.model.num_heads
hidden_size = cfg.model.embed_dim
head_size = int(hidden_size / num_heads)
qkv_weight = (
value.view([3, num_heads, head_size, hidden_size])
.permute(1, 0, 2, 3)
.contiguous()
.view(hidden_size * 3, hidden_size)
)
return qkv_weight
def convert_qkv_bias(cfg, value):
"""
Convert qkv.bias to be compatible with LiBai transformer layer
Args:
cfg: config file
value: qkv.bias in the loaded checkpoint
"""
num_heads = cfg.model.num_heads
hidden_size = cfg.model.embed_dim
head_size = int(hidden_size / num_heads)
qkv_bias = (
value.view(3, num_heads, head_size).permute(1, 0, 2).contiguous().view(hidden_size * 3)
)
return qkv_bias
def filter_keys(key, value, cfg):
"""
Filtering the state_dict keys and values to match LiBai's MAE model
"""
if key.startswith("decoder_"):
value = None
elif "norm1" in key:
key = key.replace("norm1", "input_layernorm")
elif "attn.qkv" in key:
key = key.replace("attn.qkv", "self_attention.query_key_value")
if "weight" in key:
value = convert_qkv_weight(cfg, value)
if "bias" in key:
value = convert_qkv_bias(cfg, value)
elif "attn.proj" in key:
key = key.replace("attn.proj", "self_attention.dense")
elif "norm2" in key:
key = key.replace("norm2", "post_attention_layernorm")
elif "mlp.fc1" in key:
key = key.replace("mlp.fc1", "mlp.dense_h_to_4h")
elif "mlp.fc2" in key:
key = key.replace("mlp.fc2", "mlp.dense_4h_to_h")
elif "fc_norm" in key:
key = key.replace("fc_norm", "norm")
elif key == "norm.weight" or key == "norm.bias":
value = None
return key, value
def log_param(key, value):
logger.info(f"{key}, shape={value.shape}")
def load_torch_checkpoint(model, cfg, path="./mae_finetuned_vit_base.pth", strict=False):
"""
Load checkpoint from the given torch weights.
Torch weight can be downloaded from the original repo:
https://github.com/facebookresearch/mae
"""
torch_dict = torch.load(path, map_location="cpu")["model"]
parameters = torch_dict
new_parameters = dict()
for key, value in parameters.items():
# log_param(key, value)
if "num_batches_tracked" not in key:
# to global tensor
key, val = filter_keys(key, value, cfg)
if val is None:
continue
val = val.detach().cpu().numpy()
val = flow.tensor(val).to_global(
sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
)
new_parameters[key] = val
msg = model.load_state_dict(new_parameters, strict=strict)
logger.info(msg)
if not cfg.eval_only:
trunc_normal_(model.head.weight, std=2e-5)
logger.info("Successfully load torch mae checkpoint.")
return model
## MOCOv3 in LiBai
**An Empirical Study of Training Self-Supervised Vision Transformers**
Xinlei Chen, Saining Xie, Kaiming He
[[`arXiv`](https://arxiv.org/abs/2104.02057)] [[`BibTeX`](#Citation)]
<p align="center">
<img src="https://user-images.githubusercontent.com/34954782/161363870-eb672518-deee-4754-b30f-be59ea91ac7e.png" width="480">
</p>
This is the OneFlow re-implementation of MOCOv3 based on [LiBai](https://libai.readthedocs.io/).
## Catalog
- [x] MOCOv3 pretraining code
- [x] MOCOv3 linear prob code
## Supported parallel modes and tasks
Based on [libai.layers](https://libai.readthedocs.io/en/latest/modules/libai.layers.html), the MOCOv3 model is automatically configured with the following parallelism modes.
<table class="docutils">
<tbody>
<tr>
<th width="80"> Model </th>
<th valign="bottom" align="left" width="120">Data Parallel</th>
<th valign="bottom" align="left" width="120">Tensor Parallel</th>
<th valign="bottom" align="left" width="120">Pipeline Parallel</th>
</tr>
<tr>
<td align="left"> <b> MOCOv3 pretrain </b> </td>
<td align="left">&#10004;</td>
<td align="left">-</td>
<td align="left">-</td>
</tr>
<tr>
<td align="left"> <b> MOCOv3 linear prob </b> </td>
<td align="left">&#10004;</td>
<td align="left">&#10004;</td>
<td align="left">&#10004;</td>
</tr>
</tbody>
</table>
## Usage
### Installation
Please see [LiBai Installation](https://libai.readthedocs.io/en/latest/tutorials/get_started/Installation.html) to install LiBai.
### Prepare the Data
Please see [Prepare the Data](https://libai.readthedocs.io/en/latest/tutorials/get_started/quick_run.html#prepare-the-data).
### Pretraining
Pretrain MOCOv3 on 8 GPUs using data parallelism:
```bash
cd /path/to/libai
bash tools/train.sh projects/MOCOV3/pretrain_net.py projects/MOCOV3/configs/moco_pretrain.py 8
```
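Configuration values can also be overridden from the command line through `LazyConfig.apply_overrides` (the same mechanism used by the evaluation command below); the keys shown here are only an illustration:
```bash
cd /path/to/libai
bash tools/train.sh projects/MOCOV3/pretrain_net.py projects/MOCOV3/configs/moco_pretrain.py 8 \
    train.train_epoch=100 train.output_dir="output/moco_pretrain_100ep"
```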
### Linear Prob
1. Set up the pretrained weight for linear prob in [moco_linear_prob.py](./configs/moco_linear_prob.py) as follows:
```python
# moco_linear_prob.py
# Path to the weight for linear prob
model.linear_prob = "path/to/pretrained_weight"
model.weight_style = "oneflow"
```
If you feel confused about the checkpoint format here, please refer to [Load and Save a Checkpoint in LiBai](https://libai.readthedocs.io/en/latest/tutorials/basics/Load_and_Save_Checkpoint.html) for more details.
2. Run the MOCOv3 linear prob on 8 GPUs using data parallelism:
```bash
cd /path/to/libai
bash tools/train.sh tools/train_net.py projects/MOCOV3/configs/moco_linear_prob.py 8
```
**Note:** if you want to run the MOCOv3 linear prob model with different parallel strategies, please refer to the [Distributed Configuration Tutorial](https://libai.readthedocs.io/en/latest/tutorials/basics/Distributed_Configuration.html).
### Evaluation
Evaluate the MOCOv3 model under LiBai on 8 GPUs:
```bash
cd /path/to/libai
bash tools/train.sh tools/train_net.py projects/MOCOV3/configs/moco_linear_prob.py 8 --eval-only train.load_weight="path/to/pretrained_weight"
```
## Advanced Usage
### MOCOv3 linear prob with a PyTorch pretrained checkpoint
You can download the PyTorch pretrained weights from the [MOCOv3 official repo](https://github.com/facebookresearch/moco-v3/blob/main/CONFIG.md) and run linear prob in LiBai by updating [moco_linear_prob.py](./configs/moco_linear_prob.py) as follows:
```python
# Path to the weight for linear prob
model.linear_prob = "/path/to/vit-s-300ep.pth.tar"
model.weight_style = "pytorch"
```
Run linear prob on 8 GPUs:
```bash
cd /path/to/libai
bash tools/train.sh tools/train_net.py projects/MOCOV3/configs/moco_linear_prob.py 8
```
## Citation
```BibTeX
@inproceedings{chen2021empirical,
title={An empirical study of training self-supervised vision transformers},
author={Chen, Xinlei and Xie, Saining and He, Kaiming},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
pages={9640--9649},
year={2021}
}
```
from oneflow.optim import SGD
from flowvision.transforms import transforms
from libai.config import get_config, LazyCall
from .models.vit_small_patch16 import model
from ..transform.linear_prob_transform import train_aug
dataloader = get_config("common/data/imagenet.py").dataloader
train = get_config("common/train.py").train
graph = get_config("common/models/graph.py").graph
optim = get_config("common/optim.py").optim
# Path to the pretrained weight for linear prob
model.linear_prob = "path/to/pretrained_weight"
model.weight_style = "oneflow"
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet/"
dataloader.test[0].dataset.root = "/path/to/imagenet/"
# Add augmentation functions
dataloader.train.dataset[0].transform = LazyCall(transforms.Compose)(transforms=train_aug)
# Refine train cfg for moco v3 model
train.train_micro_batch_size = 128
train.test_micro_batch_size = 32
train.train_epoch = 90
train.log_period = 1
train.evaluation.eval_period = 1000
optim._target_ = SGD
optim.params.clip_grad_max_norm = None
optim.params.clip_grad_norm_type = None
optim.params.weight_decay_norm = None
optim.params.weight_decay_bias = None
del optim.betas
del optim.eps
del optim.do_bias_correction
# Refine optimizer cfg for moco v3 model
# Reference:
# https://github.com/facebookresearch/moco-v3/blob/main/CONFIG.md
# https://github.com/facebookresearch/moco-v3/blob/main/main_lincls.py
base_lr = 3.0
actual_lr = base_lr * (train.train_micro_batch_size * 8 / 256)
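# With the settings above this gives 3.0 * (128 * 8 / 256) = 12.0.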
optim.lr = actual_lr
optim.weight_decay = 0.0
optim.momentum = 0.9
# Scheduler
train.scheduler.warmup_iter = 0
train.scheduler.alpha = 0
graph.enabled = False
from flowvision import transforms
from libai.config import get_config, LazyCall
from .models.moco_vit_small_patch16 import model
from transform.pretrain_transform import TwoCropsTransform, augmentation1, augmentation2
dataloader = get_config("common/data/imagenet.py").dataloader
train = get_config("common/train.py").train
graph = get_config("common/models/graph.py").graph
optim = get_config("common/optim.py").optim
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet/"
dataloader.test[0].dataset.root = "/path/to/imagenet/"
# Add augmentation functions
dataloader.train.dataset[0].transform = LazyCall(TwoCropsTransform)(
base_transform1=LazyCall(transforms.Compose)(transforms=augmentation1),
base_transform2=LazyCall(transforms.Compose)(transforms=augmentation2),
)
# the momentum of MOCOV3
model.m = 0.99
# the temperature coefficient of MOCOV3
model.T = 0.2
# Refine train cfg for moco v3 model
train.train_micro_batch_size = 32
train.test_micro_batch_size = 32
train.train_epoch = 300
train.warmup_ratio = 40 / 300
train.eval_period = 5
train.log_period = 1
# Refine optimizer cfg for moco v3 model
base_lr = 1.5e-4
actual_lr = base_lr * (train.train_micro_batch_size * 8 / 256)
optim.lr = actual_lr
optim.weight_decay = 0.1
# Scheduler
train.scheduler.warmup_factor = 0.001
train.scheduler.alpha = 1.5e-4
train.scheduler.warmup_method = "linear"
graph.enabled = False
from libai.config import LazyCall
from modeling.moco import MoCo_ViT
from modeling.vit import VisionTransformer
base_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=False,
stop_grad_conv1=True,
)
momentum_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=False,
stop_grad_conv1=True,
)
model = LazyCall(MoCo_ViT)(
base_encoder=base_encoder,
momentum_encoder=momentum_encoder,
dim=256,
mlp_dim=4096,
T=0.2,
m=0.99,
)