Commit 9fdb7dab authored by yuguo960516

bloom
from flowvision.transforms import transforms, InterpolationMode
from flowvision.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from libai.config import LazyCall, get_config
from configs.models.mae_vit_base_patch16 import model
from data.pretraining_imagenet import PretrainingImageNetDataset
from utils.lr_decay import param_groups_weight_decay
from utils.scheduler import warmup_cosine_lr_scheduler
train = get_config("common/train.py").train
optim = get_config("common/optim.py").optim
graph = get_config("common/models/graph.py").graph
dataloader = get_config("common/data/imagenet.py").dataloader
# Enable graph mode for faster MAE training
graph.enabled = True
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet"
dataloader.train.dataset[0]._target_ = PretrainingImageNetDataset
# No test data for pretraining
del dataloader.test
# Refine data transform to MAE's default settings
transform_train = LazyCall(transforms.Compose)(
transforms=[
LazyCall(transforms.RandomResizedCrop)(
size=(224, 224),
scale=(0.2, 1.0),
interpolation=InterpolationMode.BICUBIC,
),
LazyCall(transforms.RandomHorizontalFlip)(),
LazyCall(transforms.ToTensor)(),
LazyCall(transforms.Normalize)(
mean=IMAGENET_DEFAULT_MEAN,
std=IMAGENET_DEFAULT_STD,
),
]
)
dataloader.train.dataset[0].transform = transform_train
# number of devices
n_gpus = 8
# Refine training settings for MAE
train.train_micro_batch_size = 64
train.num_accumulation_steps = 8
effective_batch_size = train.train_micro_batch_size * train.num_accumulation_steps * n_gpus
train.train_epoch = 800
train.warmup_ratio = 40 / 800
train.log_period = 20
train.checkpointer.save_model_after_n_epoch = 20
# enable activation checkpointing
# train.activation_checkpoint.enabled = True
# set rdma enabled when num nodes > 1
# train.rdma_enabled = False
# The base learning rate in MAE is 1.5e-4.
# The actual learning rate is computed by the linear scaling rule:
#     lr = base_lr * effective_batch_size / 256
# In LiBai, refine the actual learning rate according to your own settings.
# Here we use 8 GPUs with a micro batch size of 64 and 8 accumulation steps,
# so the effective batch size is 64 * 8 * 8 = 4096.
base_lr = 1.5e-4
actual_lr = base_lr * effective_batch_size / 256
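# With the settings above this gives 1.5e-4 * 4096 / 256 = 2.4e-3.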
# Refine optim settings
optim.params._target_ = param_groups_weight_decay
optim.params.weight_decay = 0.05
optim.lr = actual_lr
optim.betas = (0.9, 0.95)
del optim.params.clip_grad_max_norm
del optim.params.clip_grad_norm_type
del optim.params.weight_decay_norm
del optim.params.weight_decay_bias
del optim.weight_decay
# Refine scheduler
# Default scheduler in LiBai training config is WarmupCosineLR
train.scheduler = LazyCall(warmup_cosine_lr_scheduler)(
warmup_factor=0.0,
min_lr=0.0,
)
# AMP
train.amp.enabled = True
# Distributed Settings
train.dist.data_parallel_size = n_gpus
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1
# train.dist.pipeline_num_layers = model.depth
from functools import partial
from libai.config import LazyCall
from libai.layers import LayerNorm
from modeling.mae import MaskedAutoencoderViT
model = LazyCall(MaskedAutoencoderViT)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
decoder_embed_dim=512,
decoder_depth=8,
decoder_num_heads=16,
mlp_ratio=4,
norm_layer=partial(LayerNorm, eps=1e-6),
norm_pix_loss=True,
mask_ratio=0.75,
)
from .mae_vit_base_patch16 import model
model.patch_size = 14
model.embed_dim = 1280
model.depth = 32
model.num_heads = 16
from .mae_vit_base_patch16 import model
model.embed_dim = 1024
model.depth = 24
model.num_heads = 16
from libai.config import LazyCall
from modeling.vit import VisionTransformer
model = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=True,
)
from .vit_base_patch16 import model
model.patch_size = 14
model.embed_dim = 1280
model.depth = 32
model.num_heads = 16
model.drop_path_rate = 0.2
from .vit_base_patch16 import model
model.embed_dim = 1024
model.depth = 24
model.num_heads = 16
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.data.datasets.imagenet import ImageNetDataset
from libai.data.structures import Instance
class PretrainingImageNetDataset(ImageNetDataset):
"""ImageNet Dataset in LiBai for Pretraining
Return:
images: ImageNet train set images
"""
def __getitem__(self, index: int):
data_sample = super().__getitem__(index)
return Instance(images=data_sample.get("images"))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
class SoftTargetCrossEntropy(nn.Module):
def __init__(self):
super(SoftTargetCrossEntropy, self).__init__()
def forward(self, x: flow.Tensor, target: flow.Tensor) -> flow.Tensor:
pred = flow.log_softmax(x, dim=-1)
loss = -target * pred
# The sum and mean below should be computed in float32.
# amp_white_identity keeps the preceding `-target * pred` in float16,
# while amp_black_identity forces the following sum and mean to run in float32.
loss = flow._C.amp_white_identity(loss)
loss = flow._C.amp_black_identity(loss)
loss = flow.sum(loss, dim=-1)
return loss.mean()
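# Usage sketch (illustrative, not part of this commit): with soft labels produced
# by mixup/cutmix, both inputs have shape [N, num_classes]:
#   criterion = SoftTargetCrossEntropy()
#   loss = criterion(logits, soft_targets)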
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# MAE Model
# References:
# mae: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------
import oneflow as flow
import oneflow.nn as nn
import libai.utils.distributed as dist
from libai.config import configurable
from libai.layers import LayerNorm, Linear, PatchEmbedding, TransformerLayer
from .pos_embed import get_2d_sincos_pos_embed
class MaskedAutoencoderViT(nn.Module):
"""Masked Autoencoder with VisionTransformer backbone"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=1024,
depth=24,
num_heads=16,
decoder_embed_dim=512,
decoder_depth=8,
decoder_num_heads=16,
mlp_ratio=4.0,
norm_layer=LayerNorm,
norm_pix_loss=False,
mask_ratio=0.75,
):
super().__init__()
self.mask_ratio = mask_ratio
# --------------------------------------------------------------------------
# MAE encoder specifics
self.patch_embed = PatchEmbedding(img_size, patch_size, in_chans, embed_dim)
num_patches = self.patch_embed.num_patches
self.cls_token = nn.Parameter(
flow.zeros(
1,
1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.pos_embed = nn.Parameter(
flow.zeros(
1,
num_patches + 1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.blocks = nn.ModuleList(
[
TransformerLayer(
hidden_size=embed_dim,
ffn_hidden_size=int(embed_dim * mlp_ratio),
num_attention_heads=num_heads,
layer_idx=i,
)
for i in range(depth)
]
)
# TODO: set norm layer placement stage id
self.norm = norm_layer(embed_dim, layer_idx=depth)
# --------------------------------------------------------------------------
# --------------------------------------------------------------------------
# MAE decoder specifics
self.decoder_embed = Linear(embed_dim, decoder_embed_dim, bias=True, layer_idx=depth)
self.mask_token = nn.Parameter(
flow.zeros(
1,
1,
decoder_embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(depth),
)
)
self.decoder_pos_embed = nn.Parameter(
flow.zeros(
1,
num_patches + 1,
decoder_embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(depth),
)
)
self.decoder_blocks = nn.ModuleList(
[
TransformerLayer(
hidden_size=decoder_embed_dim,
ffn_hidden_size=int(decoder_embed_dim * mlp_ratio),
num_attention_heads=decoder_num_heads,
layer_idx=(i + depth),
)
for i in range(decoder_depth)
]
)
self.decoder_norm = norm_layer(decoder_embed_dim, layer_idx=-1)
self.decoder_pred = Linear(
decoder_embed_dim, patch_size ** 2 * in_chans, bias=True, layer_idx=-1
) # decoder to patch
# --------------------------------------------------------------------------
self.norm_pix_loss = norm_pix_loss
self.initialize_weights()
def initialize_weights(self):
# initialization
# initialize (and freeze) pos_embed by sin-cos embedding
pos_embed = get_2d_sincos_pos_embed(
self.pos_embed.shape[-1], int(self.patch_embed.num_patches ** 0.5), cls_token=True
)
self.pos_embed.data.copy_(
flow.from_numpy(pos_embed)
.float()
.unsqueeze(0)
.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=self.pos_embed.placement,
)
)
decoder_pos_embed = get_2d_sincos_pos_embed(
self.decoder_pos_embed.shape[-1],
int(self.patch_embed.num_patches ** 0.5),
cls_token=True,
)
self.decoder_pos_embed.data.copy_(
flow.from_numpy(decoder_pos_embed)
.float()
.unsqueeze(0)
.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=self.decoder_pos_embed.placement,
)
)
# initialize patch_embed like nn.Linear (instead of nn.Conv2d)
w = self.patch_embed.proj.weight.data
flow.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
# timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
flow.nn.init.normal_(self.cls_token, std=0.02)
flow.nn.init.normal_(self.mask_token, std=0.02)
# initialize nn.Linear and nn.LayerNorm
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, Linear):
# we use xavier_uniform following official JAX ViT:
flow.nn.init.xavier_uniform_(m.weight)
if isinstance(m, Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"num_heads": cfg.num_heads,
"decoder_embed_dim": cfg.decoder_embed_dim,
"decoder_depth": cfg.decoder_depth,
"decoder_num_heads": cfg.decoder_num_heads,
"mlp_ratio": cfg.mlp_ratio,
"norm_layer": cfg.norm_layer,
"norm_pix_loss": cfg.norm_pix_loss,
"mask_ratio": cfg.mask_ratio,
}
def patchify(self, imgs):
"""
imgs: (N, 3, H, W)
x: (N, L, patch_size**2 *3)
"""
p = self.patch_embed.patch_size[0]
assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
h = w = imgs.shape[2] // p
x = imgs.reshape(imgs.shape[0], 3, h, p, w, p)
# TODO: replace permute with flow.einsum
# (n c h p w q) -> (n h w p q c)
x = x.permute(0, 2, 4, 3, 5, 1)
x = x.reshape(imgs.shape[0], h * w, p ** 2 * 3)
return x
def unpatchify(self, x):
"""
x: (N, L, patch_size**2 *3)
imgs: (N, 3, H, W)
"""
p = self.patch_embed.patch_size[0]
h = w = int(x.shape[1] ** 0.5)
assert h * w == x.shape[1]
x = x.reshape(x.shape[0], h, w, p, p, 3)
# TODO: replace permute with flow.einsum
# (n h w p q c) -> (n c h p w q)
x = x.permute(0, 5, 1, 3, 2, 4)
imgs = x.reshape(x.shape[0], 3, h * p, h * p)
return imgs
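# Shape example (illustrative): with imgs of shape (N, 3, 224, 224) and p = 16,
# patchify gives (N, 196, 768) and unpatchify maps it back to (N, 3, 224, 224).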
def random_masking(self, x, mask_ratio):
"""
Perform per-sample random masking by per-sample shuffling.
Per-sample shuffling is done by argsort random noise.
x: [N, L, D], sequence
"""
N, L, D = x.shape
len_keep = int(L * (1 - mask_ratio))
noise = flow.rand(N, L, sbp=x.sbp, placement=x.placement) # noise in [0, 1]
# sort noise for each sample
ids_shuffle = flow.argsort(noise, dim=1) # ascend: small is keep, large is remove
ids_restore = flow.argsort(ids_shuffle, dim=1)
# keep the first subset
ids_keep = ids_shuffle[:, :len_keep]
x_masked = flow.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
# generate the binary mask: 0 is keep, 1 is remove
mask = flow.ones([N, L], sbp=x.sbp, placement=x.placement)
mask[:, :len_keep] = 0
# unshuffle to get binary mask
mask = flow.gather(mask, dim=1, index=ids_restore)
return x_masked, mask, ids_restore
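# Shape example (illustrative): with N=2, L=196, D=768 and mask_ratio=0.75,
# len_keep = 49, so x_masked is [2, 49, 768] while mask and ids_restore are [2, 196].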
def forward_encoder(self, x, mask_ratio):
# embed patches
x = self.patch_embed(x)
# add pos embed w/o cls token
x = x + self.pos_embed[:, 1:, :]
# masking: length -> length * mask_ratio
x, mask, ids_restore = self.random_masking(x, mask_ratio)
# append cls token
cls_token = self.cls_token + self.pos_embed[:, :1, :]
# Directly expanding cls_token (shape = (1, 1, D), sbp = B)
# would produce a huge tensor of shape [B*N, 1, D]
# (where B = local batch size, N = total number of devices),
# but we only need an expanded cls_token of shape [B, 1, D] per device.
# Since local-to-global tensor conversion is not available in graph mode for now,
# we expand cls_token in two stages as below.
world_size = flow.env.get_world_size()
# repeat to (N, 1, D), sbp = B
cls_token = cls_token.expand(world_size, -1, -1)
# to_global(sbp=S(0)), local shape = (1, 1, D)
cls_token = cls_token.to_global(sbp=x.sbp)
# second expand from (N, 1, D) to (B*N, 1, D)
# (global shape, sbp=S(0)), local shape=(B, 1, D),
# by this way we wouldn't produce a (B*N, 1, D) tensor in local view.
cls_tokens = cls_token.repeat(x.shape[0] // world_size, 1, 1)
x = flow.cat((cls_tokens, x), dim=1)
# apply Transformer blocks
for blk in self.blocks:
x = blk(x)
x = self.norm(x)
return x, mask, ids_restore
def forward_decoder(self, x, ids_restore):
# embed tokens
x = self.decoder_embed(x)
# append mask tokens to sequence
# mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
# The line above would produce a huge mask_tokens of shape [B*N, L, D]
# (where B = local batch size, N = total number of devices),
# while we only need mask_tokens of shape [B, L, D] in the local view.
# Since local-to-global tensor conversion is not available in graph mode for now,
# we repeat the mask token in two stages as below.
world_size = flow.env.get_world_size()
# repeat to (N, 1, D), sbp = B
mask_token = self.mask_token.repeat(world_size, 1, 1)
# to_global(sbp=S(0)), local shape = (1, 1, D)
mask_token = mask_token.to_global(sbp=x.sbp)
# second repeat from (N, 1, D) to (B*N, L, D)
# (global shape, sbp=S(0)), local shape = (B, L, D),
# and the originally huge mask_tokens with shape (B*N, L, D)
# wouldn't be produced in local view.
mask_tokens = mask_token.repeat(
x.shape[0] // world_size, ids_restore.shape[1] + 1 - x.shape[1], 1
)
x_ = flow.cat([x[:, 1:, :], mask_tokens], dim=1) # no cls token
x_ = flow.gather(
x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
) # unshuffle
x = flow.cat([x[:, :1, :], x_], dim=1) # append cls token
# add pos embed
x = x + self.decoder_pos_embed
# apply Transformer blocks
for blk in self.decoder_blocks:
x = blk(x)
x = self.decoder_norm(x)
# predictor projection
x = self.decoder_pred(x)
# remove cls token
x = x[:, 1:, :]
return x
def forward_loss(self, imgs, pred, mask):
"""
imgs: [N, 3, H, W]
pred: [N, L, p*p*3]
mask: [N, L], 0 is keep, 1 is remove,
"""
target = self.patchify(imgs)
if self.norm_pix_loss:
mean = target.mean(dim=-1, keepdim=True)
var = target.var(dim=-1, keepdim=True)
target = (target - mean) / (var + 1.0e-6) ** 0.5
loss = (pred - target) ** 2
# We want the loss above to be computed in float16
# and the mean/sum below to be computed in float32.
# This amp_white_identity keeps the preceding ops in float16,
loss = flow._C.amp_white_identity(loss)
# and this amp_black_identity forces the succeeding ops to run in float32.
loss = flow._C.amp_black_identity(loss)
loss = loss.mean(dim=-1) # [N, L], mean loss per patch
loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
return loss
def forward(self, images):
latent, mask, ids_restore = self.forward_encoder(images, self.mask_ratio)
pred = self.forward_decoder(latent, ids_restore) # [N, L, p*p*3]
loss = self.forward_loss(images, pred, mask)
if self.training:
return {"losses": loss}
else:
return {
"losses": loss,
"pred": pred,
"mask": mask,
}
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
"""
Arguments:
embed_dim: hidden_size of the input tokens
grid_size: int of the grid height and width
cls_token: with cls_token or not
Return:
pos_embed: [grid_size*grid_size, embed_dim]
or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
grid_h = np.arange(grid_size, dtype=np.float32)
grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_size, grid_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token:
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
return pos_embed
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float64)
omega /= embed_dim / 2.0
omega = 1.0 / 10000 ** omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
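# Shape example (illustrative): get_2d_sincos_pos_embed(768, 14, cls_token=True)
# returns a (1 + 14 * 14, 768) numpy array, matching the (num_patches + 1, embed_dim)
# positional embedding used by the MAE encoder above.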
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ViT Model
# References:
# mae: https://github.com/facebookresearch/mae/blob/main/models_vit.py
# --------------------------------------------------------
import oneflow as flow
import libai.models.vision_transformer
class VisionTransformer(libai.models.vision_transformer.VisionTransformer):
"""Vision Transformer for MAE
LiBai impl of: `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
- https://arxiv.org/abs/2010.11929
"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.0,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.0,
global_pool=False,
num_classes=1000,
loss_func=None,
):
super(VisionTransformer, self).__init__(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
depth=depth,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=drop_path_rate,
num_classes=num_classes,
loss_func=loss_func,
)
self.global_pool = global_pool
def no_weight_decay(self):
return {"pos_embed", "cls_token"}
def forward_head(self, x):
if self.global_pool:
x = x[:, 1:, :] # global pool without cls token
# We want the mean to be computed in float32.
# The amp_white_identity pair keeps the computation before and after the mean in float16,
# while the amp_black_identity pair forces the mean itself to run in float32.
x = flow._C.amp_white_identity(x)
x = flow._C.amp_black_identity(x)
x = x.mean(dim=1)
x = flow._C.amp_black_identity(x)
x = flow._C.amp_white_identity(x)
outcome = self.norm(x)
outcome = self.head(outcome)
else:
x = self.norm(x)
outcome = x[:, 0]
outcome = self.head(outcome)
return outcome
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
import sys
import numpy as np
import oneflow as flow
from utils.weight_convert import load_torch_checkpoint
from libai.config import LazyConfig, default_argument_parser, try_get_key
from libai.engine import DefaultTrainer, default_setup
from libai.utils.checkpoint import Checkpointer
sys.path.append(".")
logger = logging.getLogger("libai.mae." + __name__)
class Trainer(DefaultTrainer):
@classmethod
def build_model(cls, cfg):
model = super().build_model(cfg)
if try_get_key(cfg, "finetune") is not None:
if cfg.finetune.enable is True:
logger.info("Loading pretrained weight for finetuning")
assert cfg.finetune.weight_style in ["oneflow", "pytorch"]
if cfg.finetune.weight_style == "oneflow":
Checkpointer(model).load(cfg.finetune.path)
elif cfg.finetune.weight_style == "pytorch":
model = load_torch_checkpoint(model, cfg, path=cfg.finetune.path, strict=False)
else:
raise NotImplementedError(
"Only support loading oneflow & pytorch pretrained weight now."
)
return model
def main(args):
cfg = LazyConfig.load(args.config_file)
cfg = LazyConfig.apply_overrides(cfg, args.opts)
default_setup(cfg, args)
if args.fast_dev_run:
cfg.train.train_epoch = 0
cfg.train.checkpointer.period = 5
cfg.train.train_iter = 10
cfg.train.evaluation.eval_period = 10
cfg.train.log_period = 1
if args.eval_only:
cfg.eval_only = True
tokenizer = None
if try_get_key(cfg, "tokenization.setup", default=False):
tokenizer = Trainer.build_tokenizer(cfg)
model = Trainer.build_model(cfg)
Checkpointer(model, save_dir=cfg.train.output_dir).resume_or_load(
cfg.train.load_weight, resume=args.resume
)
if try_get_key(cfg, "train.graph.enabled", default=False):
model = Trainer.build_graph(cfg, model, is_train=False)
test_loader = Trainer.build_test_loader(cfg, tokenizer)
if len(test_loader) == 0:
logger.info("No dataset in dataloader.test, please set dataset for dataloader.test")
_ = Trainer.test(cfg, test_loader, model)
return
# manual different seed for each rank
seed_for_rank = cfg.train.seed + flow.env.get_rank()
flow.manual_seed(seed_for_rank)
flow.cuda.manual_seed(seed_for_rank)
np.random.seed(seed_for_rank)
random.seed(seed_for_rank)
trainer = Trainer(cfg)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
main(args)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# References:
# mae: https://github.com/facebookresearch/mae/blob/main/util/lr_decay.py
# --------------------------------------------------------
import logging
logger = logging.getLogger("libai.mae." + __name__)
def param_groups_lrd(model, weight_decay=0.05, layer_decay=0.75):
"""
Parameter groups for layer-wise lr decay
Modified from BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58
"""
param_group_names = {}
param_groups = {}
no_weight_decay_list = model.no_weight_decay()
num_layers = len(model.blocks) + 1
layer_scales = list(layer_decay ** (num_layers - i) for i in range(num_layers + 1))
for name, param in model.named_parameters():
if not param.requires_grad:
continue
if param.ndim == 1 or name in no_weight_decay_list:
g_decay = "no_decay"
this_decay = 0.0
else:
g_decay = "decay"
this_decay = weight_decay
layer_idx = get_layer_idx_for_vit(name, num_layers)
group_name = "layer_%d_%s" % (layer_idx, g_decay)
# logger.info(
# f"{name}, shape={param.shape}, {g_decay}={this_decay}"
# f", layer_scale={layer_scales[layer_idx]}"
# )
if group_name not in param_group_names:
this_scale = layer_scales[layer_idx]
param_group_names[group_name] = {
"lr_scale": this_scale,
"weight_decay": this_decay,
"params": [],
}
param_groups[group_name] = {
"lr_scale": this_scale,
"weight_decay": this_decay,
"params": [],
}
param_group_names[group_name]["params"].append(name)
param_groups[group_name]["params"].append(param)
return list(param_groups.values())
def get_layer_idx_for_vit(name, num_layers):
"""
Assign a parameter with its layer id
Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
"""
if name in ["cls_token", "pos_embed"]:
return 0
elif name.startswith("patch_embed"):
return 0
elif name.startswith("blocks"):
return int(name.split(".")[1]) + 1
else:
return num_layers
# Refer to: add_weight_decay in
# https://github.com/rwightman/pytorch-image-models/blob/v0.3.3/timm/optim/optim_factory.py
def param_groups_weight_decay(model, weight_decay=1e-5, skip_list=()):
decay_params = []
no_decay_params = []
for name, param in model.named_parameters():
if not param.requires_grad:
continue # frozen weights
if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list:
no_decay_params.append(param)
else:
decay_params.append(param)
return [
{"params": no_decay_params, "weight_decay": 0.0},
{"params": decay_params, "weight_decay": weight_decay},
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import math
import oneflow as flow
from oneflow.optim.lr_scheduler import _LRScheduler
logger = logging.getLogger(__name__)
class LayerScaleWarmupCosineDecayLR(_LRScheduler):
def __init__(
self,
optimizer: flow.optim.Optimizer,
steps: int,
warmup_steps: int,
warmup_factor: float,
min_lr: float = 0.0,
last_step: int = -1,
verbose: bool = False,
):
self.total_steps = steps
self.decay_steps = steps - warmup_steps
self.warmup_steps = warmup_steps
self.warmup_factor = warmup_factor
self.min_lr = min_lr
super().__init__(optimizer, last_step, verbose)
def get_lr(self, base_lr, step):
if step < self.warmup_steps:
progress = step / self.warmup_steps
lr = base_lr * progress
elif step < self.total_steps:
progress = (step - self.warmup_steps) / self.decay_steps
lr = self.min_lr + (base_lr - self.min_lr) * 0.5 * (1.0 + math.cos(math.pi * progress))
else:
lr = self.min_lr
return lr
def update_lrs(self, lrs):
self._last_lr = []
for i, (group, lr) in enumerate(zip(self.optimizer.param_groups, lrs)):
if "lr_scale" in group:
group["lr"] = lr * group["lr_scale"]
else:
group["lr"] = lr
self._last_lr.append(lr)
if self.verbose:
self.print_lr(i, lr)
def warmup_layerscale_cosine_lr_scheduler(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_iter: int,
warmup_factor: float,
min_lr: float = 0.0,
):
return LayerScaleWarmupCosineDecayLR(
optimizer,
steps=max_iter,
warmup_steps=warmup_iter,
warmup_factor=warmup_factor,
min_lr=min_lr,
)
def warmup_cosine_lr_scheduler(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_iter: int,
warmup_factor: float = 0.0,
warmup_method: str = "linear",
min_lr: float = 0.0,
):
cosine_lr = flow.optim.lr_scheduler.CosineAnnealingLR(
optimizer, T_max=max_iter - warmup_iter, eta_min=min_lr
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return CosineLR")
return cosine_lr
if warmup_iter > max_iter:
logger.warning("warmup iters is larger than the total training iters")
warmup_cosine_lr = flow.optim.lr_scheduler.WarmupLR(
cosine_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_cosine_lr
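# Usage sketch (illustrative, assuming an already-built oneflow optimizer):
#   scheduler = warmup_cosine_lr_scheduler(
#       optimizer, max_iter=total_train_iters, warmup_iter=warmup_iters, warmup_factor=0.0
#   )
#   # call scheduler.step() once per training iteration
# In the MAE config above, LiBai instantiates this through train.scheduler via LazyCall.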
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import oneflow as flow
import torch
from flowvision.layers.weight_init import trunc_normal_
logger = logging.getLogger("libai.mae." + __name__)
def convert_qkv_weight(cfg, value):
"""
Convert qkv.weight to be compatible with LiBai transformer layer
Args:
cfg: config file
value: qkv.weight in the loaded checkpoint
"""
num_heads = cfg.model.num_heads
hidden_size = cfg.model.embed_dim
head_size = int(hidden_size / num_heads)
qkv_weight = (
value.view([3, num_heads, head_size, hidden_size])
.permute(1, 0, 2, 3)
.contiguous()
.view(hidden_size * 3, hidden_size)
)
return qkv_weight
def convert_qkv_bias(cfg, value):
"""
Convert qkv.bias to be compatible with LiBai transformer layer
Args:
cfg: config file
value: qkv.bias in the loaded checkpoint
"""
num_heads = cfg.model.num_heads
hidden_size = cfg.model.embed_dim
head_size = int(hidden_size / num_heads)
qkv_bias = (
value.view(3, num_heads, head_size).permute(1, 0, 2).contiguous().view(hidden_size * 3)
)
return qkv_bias
def filter_keys(key, value, cfg):
"""
Filtering the state_dict keys and values to match LiBai's MAE model
"""
if key.startswith("decoder_"):
value = None
elif "norm1" in key:
key = key.replace("norm1", "input_layernorm")
elif "attn.qkv" in key:
key = key.replace("attn.qkv", "self_attention.query_key_value")
if "weight" in key:
value = convert_qkv_weight(cfg, value)
if "bias" in key:
value = convert_qkv_bias(cfg, value)
elif "attn.proj" in key:
key = key.replace("attn.proj", "self_attention.dense")
elif "norm2" in key:
key = key.replace("norm2", "post_attention_layernorm")
elif "mlp.fc1" in key:
key = key.replace("mlp.fc1", "mlp.dense_h_to_4h")
elif "mlp.fc2" in key:
key = key.replace("mlp.fc2", "mlp.dense_4h_to_h")
elif "fc_norm" in key:
key = key.replace("fc_norm", "norm")
elif key == "norm.weight" or key == "norm.bias":
value = None
return key, value
def log_param(key, value):
logger.info(f"{key}, shape={value.shape}")
def load_torch_checkpoint(model, cfg, path="./mae_finetuned_vit_base.pth", strict=False):
"""
Load checkpoint from the given torch weights.
Torch weight can be downloaded from the original repo:
https://github.com/facebookresearch/mae
"""
torch_dict = torch.load(path, map_location="cpu")["model"]
parameters = torch_dict
new_parameters = dict()
for key, value in parameters.items():
# log_param(key, value)
if "num_batches_tracked" not in key:
# to global tensor
key, val = filter_keys(key, value, cfg)
if val is None:
continue
val = val.detach().cpu().numpy()
val = flow.tensor(val).to_global(
sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
)
new_parameters[key] = val
msg = model.load_state_dict(new_parameters, strict=strict)
logger.info(msg)
if not cfg.eval_only:
trunc_normal_(model.head.weight, std=2e-5)
logger.info("Successfully load torch mae checkpoint.")
return model
## MOCOv3 in LiBai
**An Empirical Study of Training Self-Supervised Vision Transformers**
Xinlei Chen, Saining Xie, Kaiming He
[[`arXiv`](https://arxiv.org/abs/2104.02057)] [[`BibTeX`](#Citation)]
<p align="center">
<img src="https://user-images.githubusercontent.com/34954782/161363870-eb672518-deee-4754-b30f-be59ea91ac7e.png" width="480">
</p>
This is the OneFlow re-implementation of MOCOv3 based on [LiBai](https://libai.readthedocs.io/).
## Catalog
- [x] MOCOv3 pretraining code
- [x] MOCOv3 linear prob code
## Supported parallel modes and tasks
Based on [libai.layers](https://libai.readthedocs.io/en/latest/modules/libai.layers.html), the MOCOv3 model is automatically configured with the following parallelism modes.
<table class="docutils">
<tbody>
<tr>
<th width="80"> Model </th>
<th valign="bottom" align="left" width="120">Data Parallel</th>
<th valign="bottom" align="left" width="120">Tensor Parallel</th>
<th valign="bottom" align="left" width="120">Pipeline Parallel</th>
</tr>
<tr>
<td align="left"> <b> MOCOv3 pretrain </b> </td>
<td align="left">&#10004;</td>
<td align="left">-</td>
<td align="left">-</td>
</tr>
<tr>
<td align="left"> <b> MOCOv3 linear prob </b> </td>
<td align="left">&#10004;</td>
<td align="left">&#10004;</td>
<td align="left">&#10004;</td>
</tr>
</tbody>
</table>
## Usage
### Installation
Please see [LiBai Installation](https://libai.readthedocs.io/en/latest/tutorials/get_started/Installation.html) to install LiBai.
### Prepare the Data
Please see [Prepare the Data](https://libai.readthedocs.io/en/latest/tutorials/get_started/quick_run.html#prepare-the-data).
### Pretraining
Pretrain MOCOv3 on 8 GPUs using data parallelism:
```bash
cd /path/to/libai
bash tools/train.sh projects/MOCOV3/pretrain_net.py projects/MOCOV3/configs/moco_pretrain.py 8
```
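Configuration values can also be overridden from the command line through `LazyConfig.apply_overrides` (the same mechanism used by the evaluation command below); the keys shown here are only an illustration:
```bash
cd /path/to/libai
bash tools/train.sh projects/MOCOV3/pretrain_net.py projects/MOCOV3/configs/moco_pretrain.py 8 \
    train.train_epoch=100 train.output_dir="output/moco_pretrain_100ep"
```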
### Linear Prob
1. Set up the pretrained weight for linear prob in [moco_linear_prob.py](./configs/moco_linear_prob.py) as follows:
```python
# moco_linear_prob.py
# Path to the weight for linear prob
model.linear_prob = "path/to/pretrained_weight"
model.weight_style = "oneflow"
```
If you feel confused about the checkpoint format here, please refer to [Load and Save a Checkpoint in LiBai](https://libai.readthedocs.io/en/latest/tutorials/basics/Load_and_Save_Checkpoint.html) for more details.
2. Run the MOCOv3 linear prob on 8 GPUs using data parallelism:
```bash
cd /path/to/libai
bash tools/train.sh tools/train_net.py projects/MOCOV3/configs/moco_linear_prob.py 8
```
**Note:** if you want to run the MOCOv3 linear prob model with different parallel strategies, please refer to the [Distributed Configuration Tutorial](https://libai.readthedocs.io/en/latest/tutorials/basics/Distributed_Configuration.html).
### Evaluation
Evaluate the MOCOv3 model under LiBai on 8 GPUs:
```bash
cd /path/to/libai
bash tools/train.sh tools/train_net.py projects/MOCOV3/configs/moco_linear_prob.py 8 --eval-only train.load_weight="path/to/pretrained_weight"
```
## Advanced Usage
### MOCOv3 linear prob with a PyTorch pretrained checkpoint
You can download the PyTorch pretrained weights from the [MOCOv3 official repo](https://github.com/facebookresearch/moco-v3/blob/main/CONFIG.md) and run linear prob in LiBai by updating [moco_linear_prob.py](./configs/moco_linear_prob.py) as follows:
```python
# Path to the weight for linear prob
model.linear_prob = "/path/to/vit-s-300ep.pth.tar"
model.weight_style = "pytorch"
```
Run linear prob on 8 GPUs:
```bash
cd /path/to/libai
bash tools/train.sh tools/train_net.py projects/MOCOV3/configs/moco_linear_prob.py 8
```
## Citation
```BibTeX
@inproceedings{chen2021empirical,
title={An empirical study of training self-supervised vision transformers},
author={Chen, Xinlei and Xie, Saining and He, Kaiming},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
pages={9640--9649},
year={2021}
}
```
from oneflow.optim import SGD
from flowvision.transforms import transforms
from libai.config import get_config, LazyCall
from .models.vit_small_patch16 import model
from ..transform.linear_prob_transform import train_aug
dataloader = get_config("common/data/imagenet.py").dataloader
train = get_config("common/train.py").train
graph = get_config("common/models/graph.py").graph
optim = get_config("common/optim.py").optim
# Path to the pretrained weight for linear prob
model.linear_prob = "path/to/pretrained_weight"
model.weight_style = "oneflow"
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet/"
dataloader.test[0].dataset.root = "/path/to/imagenet/"
# Add augmentation functions
dataloader.train.dataset[0].transform = LazyCall(transforms.Compose)(transforms=train_aug)
# Refine train cfg for moco v3 model
train.train_micro_batch_size = 128
train.test_micro_batch_size = 32
train.train_epoch = 90
train.log_period = 1
train.evaluation.eval_period = 1000
optim._target_ = SGD
optim.params.clip_grad_max_norm = None
optim.params.clip_grad_norm_type = None
optim.params.weight_decay_norm = None
optim.params.weight_decay_bias = None
del optim.betas
del optim.eps
del optim.do_bias_correction
# Refine optimizer cfg for moco v3 model
# Reference:
# https://github.com/facebookresearch/moco-v3/blob/main/CONFIG.md
# https://github.com/facebookresearch/moco-v3/blob/main/main_lincls.py
base_lr = 3.0
actual_lr = base_lr * (train.train_micro_batch_size * 8 / 256)
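# With the settings above this gives 3.0 * (128 * 8 / 256) = 12.0.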
optim.lr = actual_lr
optim.weight_decay = 0.0
optim.momentum = 0.9
# Scheduler
train.scheduler.warmup_iter = 0
train.scheduler.alpha = 0
graph.enabled = False
from flowvision import transforms
from libai.config import get_config, LazyCall
from .models.moco_vit_small_patch16 import model
from transform.pretrain_transform import TwoCropsTransform, augmentation1, augmentation2
dataloader = get_config("common/data/imagenet.py").dataloader
train = get_config("common/train.py").train
graph = get_config("common/models/graph.py").graph
optim = get_config("common/optim.py").optim
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet/"
dataloader.test[0].dataset.root = "/path/to/imagenet/"
# Add augmentation functions
dataloader.train.dataset[0].transform = LazyCall(TwoCropsTransform)(
base_transform1=LazyCall(transforms.Compose)(transforms=augmentation1),
base_transform2=LazyCall(transforms.Compose)(transforms=augmentation2),
)
# the momentum of MOCOV3
model.m = 0.99
# the temperature coefficient of MOCOV3
model.T = 0.2
# Refine train cfg for moco v3 model
train.train_micro_batch_size = 32
train.test_micro_batch_size = 32
train.train_epoch = 300
train.warmup_ratio = 40 / 300
train.eval_period = 5
train.log_period = 1
# Refine optimizer cfg for moco v3 model
base_lr = 1.5e-4
actual_lr = base_lr * (train.train_micro_batch_size * 8 / 256)
optim.lr = actual_lr
optim.weight_decay = 0.1
# Scheduler
train.scheduler.warmup_factor = 0.001
train.scheduler.alpha = 1.5e-4
train.scheduler.warmup_method = "linear"
graph.enabled = False
from libai.config import LazyCall
from modeling.moco import MoCo_ViT
from modeling.vit import VisionTransformer
base_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=False,
stop_grad_conv1=True,
)
momentum_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=False,
stop_grad_conv1=True,
)
model = LazyCall(MoCo_ViT)(
base_encoder=base_encoder,
momentum_encoder=momentum_encoder,
dim=256,
mlp_dim=4096,
T=0.2,
m=0.99,
)