"tasks/knwl_dialo/preprocessing.py" did not exist on "b1a6d73b3a34a5e53db9272aec582c407bb2905b"
Commit 73557d95 authored by yuguo960516's avatar yuguo960516
Browse files

glm

parents
Pipeline #148 failed with stages
in 0 seconds
from libai.config import LazyCall
from modeling.moco import MoCo_ViT
from modeling.vit import VisionTransformer
base_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=False,
stop_grad_conv1=True,
)
momentum_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=False,
stop_grad_conv1=True,
)
model = LazyCall(MoCo_ViT)(
base_encoder=base_encoder,
momentum_encoder=momentum_encoder,
dim=256,
mlp_dim=4096,
T=0.2,
m=0.99,
)
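# Note: LazyCall only records the target class and its kwargs; nothing is
# constructed when this config is imported. A minimal sketch of how such a
# config is typically materialized (assuming LiBai's detectron2-style
# `instantiate` helper; names here are illustrative):
#
#   from libai.config import instantiate
#   moco_model = instantiate(model)  # builds both encoders, then MoCo_ViT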
from libai.config import LazyCall
from modeling.moco import MoCo_ViT
from modeling.vit import VisionTransformer
base_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=384,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.0,
global_pool=False,
stop_grad_conv1=True,
)
momentum_encoder = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=384,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.0,
global_pool=False,
stop_grad_conv1=True,
)
model = LazyCall(MoCo_ViT)(
base_encoder=base_encoder,
momentum_encoder=momentum_encoder,
dim=256,
mlp_dim=4096,
T=0.2,
m=0.99,
)
import sys
sys.path.append("projects/MOCOV3")
from libai.config import LazyCall # noqa: E402
from modeling.vit import VisionTransformer # noqa: E402
model = LazyCall(VisionTransformer)(
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
drop_path_rate=0.1,
global_pool=False,
)
from .vit_base_patch16 import model
model.embed_dim = 384
model.depth = 12
model.num_heads = 12
model.drop_path_rate = 0.0
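# Because the imported `model` is still a lazy config object, reassigning
# attributes here is enough to derive the ViT-Small variant from the ViT-Base
# config above; the overrides take effect when the config is instantiated.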
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# MoCo v3 Model
# References:
# moco-v3: https://github.com/facebookresearch/moco-v3/blob/main/moco/builder.py
# --------------------------------------------------------
import math
import oneflow as flow
import oneflow.nn as nn
from libai.layers import Linear
from libai.utils.distributed import get_world_size
class MoCo(nn.Module):
"""
Build a MoCo model with a base encoder, a momentum encoder, and two MLPs
https://arxiv.org/abs/1911.05722
"""
def __init__(
self, base_encoder, momentum_encoder, dim=256, mlp_dim=4096, T=1.0, m=0.99, max_iter=300
):
"""
dim: feature dimension (default: 256)
mlp_dim: hidden dimension in MLPs (default: 4096)
T: softmax temperature (default: 1.0)
m: base momentum coefficient for the momentum encoder (default: 0.99)
max_iter: total number of training iterations, used by the cosine
momentum schedule (default: 300)
"""
super(MoCo, self).__init__()
self.T = T
self.m = m
# build encoders
self.base_encoder = base_encoder
self.momentum_encoder = momentum_encoder
self.base_encoder.num_classes = dim
self.momentum_encoder.num_classes = dim
self.max_iter = max_iter
self._build_projector_and_predictor_mlps(dim, mlp_dim)
for param_b, param_m in zip(
self.base_encoder.parameters(), self.momentum_encoder.parameters()
):
param_m.data.copy_(param_b.data) # initialize
param_m.requires_grad = False # not update by gradient
def _build_mlp(self, num_layers, input_dim, mlp_dim, output_dim, last_bn=True):
mlp = []
for l in range(num_layers):
dim1 = input_dim if l == 0 else mlp_dim
dim2 = output_dim if l == num_layers - 1 else mlp_dim
mlp.append(Linear(dim1, dim2, bias=False)) # libai
if l < num_layers - 1:
mlp.append(nn.BatchNorm1d(dim2))
mlp.append(nn.ReLU(inplace=True))
elif last_bn:
# follow SimCLR's design:
# https://github.com/google-research/simclr/blob/master/model_util.py#L157
# for simplicity, we further removed gamma in BN
mlp.append(nn.BatchNorm1d(dim2, affine=False))
return nn.Sequential(*mlp)
def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
pass
@flow.no_grad()
def _update_momentum_encoder(self, m):
"""Momentum update of the momentum encoder"""
for param_b, param_m in zip(
self.base_encoder.parameters(), self.momentum_encoder.parameters()
):
param_m.data = param_m.data * m + param_b.data * (1.0 - m)
def contrastive_loss(self, q, k):
# normalize
q = nn.functional.normalize(q, dim=1)
k = nn.functional.normalize(k, dim=1)
# gather all targets
# k = concat_all_gather(k).to_global(sbp=q.sbp, placement=q.placement)
k = k.to_global(sbp=flow.sbp.broadcast)
# Einstein sum is more intuitive
logits = flow.einsum("nc,mc->nm", q, k) / self.T
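# Keys were gathered to every rank via the broadcast above, so for the N
# queries owned by rank r, the positive key of query i sits at column
# r * N + i of `logits`; that is exactly the label constructed below. The
# final loss is scaled by 2 * T, matching the official moco-v3
# implementation referenced in the file header.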
N = logits.shape[0] // get_world_size()
labels = (flow.arange(N, dtype=flow.long) + N * flow.env.get_rank()).to_global(
sbp=flow.sbp.split(0), placement=logits.placement
)
return nn.CrossEntropyLoss()(logits, labels) * (2 * self.T)
def adjust_moco_momentum(self, cu_iter, m):
"""Adjust moco momentum based on current epoch"""
m = 1.0 - 0.5 * (1.0 + math.cos(math.pi * cu_iter / self.max_iter)) * (1.0 - m)
return m
def forward(self, images, labels=None, cu_iter=0, m=0.99):
if self.training:
[x1, x2] = flow.chunk(images, 2, dim=1)
# compute features
q1 = self.predictor(self.base_encoder(x1)["prediction_scores"])
q2 = self.predictor(self.base_encoder(x2)["prediction_scores"])
m = self.adjust_moco_momentum(cu_iter, m) # update the moco_momentum
with flow.no_grad(): # no gradient
self._update_momentum_encoder(m) # update the momentum encoder
# compute momentum features as targets
k1 = self.momentum_encoder(x1)["prediction_scores"]
k2 = self.momentum_encoder(x2)["prediction_scores"]
return (
{"losses": self.contrastive_loss(q1, k2) + self.contrastive_loss(q2, k1)},
{"m": m},
)
else:
return self.base_encoder(images)
class MoCo_ViT(MoCo):
def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
hidden_dim = self.base_encoder.head.weight.shape[1]
# projectors
self.base_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
self.momentum_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
# predictor
self.predictor = self._build_mlp(2, dim, mlp_dim, dim)
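# Minimal usage sketch (illustrative only; single-device eager mode assumed,
# with the MOCOV3 `VisionTransformer` variant defined in vit.py):
#
#   base = VisionTransformer(embed_dim=768, depth=12, num_heads=12)
#   momentum = VisionTransformer(embed_dim=768, depth=12, num_heads=12)
#   moco = MoCo_ViT(base, momentum, dim=256, mlp_dim=4096, T=0.2, m=0.99)
#   losses, m_dict = moco(images)  # images: two crops stacked on dim=1, [B, 6, H, W]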
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ViT Model
# References:
# moco-v3: https://github.com/facebookresearch/moco-v3/blob/main/vits.py
# --------------------------------------------------------
import math
from functools import reduce
from operator import mul
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
from utils.load_checkpoint import load_checkpoint
from libai.layers import Linear, PatchEmbedding
from libai.models import vision_transformer
class VisionTransformer(vision_transformer.VisionTransformer):
"""Vision Transformer for MOCO
LiBai impl of: `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
- https://arxiv.org/abs/2010.11929
"""
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4.0,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.0,
global_pool=False,
num_classes=1000,
loss_func=None,
linear_prob=None,
weight_style="pytorch",
stop_grad_conv1=False,
):
super(VisionTransformer, self).__init__(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
depth=depth,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=drop_path_rate,
num_classes=num_classes,
loss_func=loss_func,
)
self.global_pool = global_pool
# weight init
if linear_prob:
load_checkpoint(self, linear_prob, weight_style, num_heads, embed_dim)
self.head.weight.data.normal_(mean=0.0, std=0.01)
self.head.bias.data.zero_()
else:
trunc_normal_(self.pos_embed, std=0.02)
trunc_normal_(self.cls_token, std=0.02)
self.apply(self._init_weights)
self.stop_grad_conv1 = stop_grad_conv1
self.embed_dim = embed_dim
self.initialization()
def initialization(self):
# Use fixed 2D sin-cos position embedding
self.build_2d_sincos_position_embedding()
# weight initialization
for name, m in self.named_modules():
if isinstance(m, Linear):
if "query_key_value" in name:
val = math.sqrt(6.0 / float(m.weight.shape[0] // 3 + m.weight.shape[1]))
nn.init.uniform_(m.weight, -val, val)
else:
nn.init.xavier_uniform_(m.weight)
nn.init.zeros_(m.bias)
nn.init.normal_(self.cls_token, std=1e-6)
if isinstance(self.patch_embed, PatchEmbedding):
# xavier_uniform initialization
val = math.sqrt(
6.0 / float(3 * reduce(mul, self.patch_embed.patch_size, 1) + self.embed_dim)
)
nn.init.uniform_(self.patch_embed.proj.weight, -val, val)
nn.init.zeros_(self.patch_embed.proj.bias)
if self.stop_grad_conv1:
self.patch_embed.proj.weight.requires_grad = False
self.patch_embed.proj.bias.requires_grad = False
def build_2d_sincos_position_embedding(self, temperature=10000.0):
sbp = self.pos_embed.sbp
placement = self.pos_embed.placement
h, w = self.patch_embed.grid_size
grid_w = flow.arange(w, dtype=flow.float32).to_global(sbp=sbp, placement=placement)
grid_h = flow.arange(h, dtype=flow.float32).to_global(sbp=sbp, placement=placement)
grid_w, grid_h = flow.meshgrid(grid_w, grid_h)
assert (
self.embed_dim % 4 == 0
), "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
pos_dim = self.embed_dim // 4
omega = (flow.arange(pos_dim, dtype=flow.float32) / pos_dim).to_global(
sbp=sbp, placement=placement
)
omega = 1.0 / flow.tensor(temperature).to_global(sbp=sbp, placement=placement) ** omega
out_w = flow.einsum("m,d->md", grid_w.flatten(), omega)
out_h = flow.einsum("m,d->md", grid_h.flatten(), omega)
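# out_w and out_h each have shape (h * w, embed_dim // 4); concatenating
# their sin and cos parts below yields a (1, h * w, embed_dim) table, and a
# zero vector is prepended for the [CLS] token.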
pos_emb = flow.cat(
[flow.sin(out_w), flow.cos(out_w), flow.sin(out_h), flow.cos(out_h)], dim=1
)[None, :, :]
pe_token = flow.zeros([1, 1, self.embed_dim], dtype=flow.float32).to_global(
sbp=sbp, placement=placement
)
self.pos_embed = nn.Parameter(flow.cat([pe_token, pos_emb], dim=1))
self.pos_embed.requires_grad = False
def forward_head(self, x):
if self.global_pool:
x = x[:, 1:, :].mean(dim=1) # global pool without cls token
outcome = self.norm(x)
outcome = self.head(outcome)
else:
x = self.norm(x)
outcome = x[:, 0]
outcome = self.head(outcome)
return outcome
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import sys
from trainer.moco_trainer import MoCoEagerTrainer
from libai.config import LazyConfig, default_argument_parser, try_get_key
from libai.engine import DefaultTrainer, default_setup
from libai.utils.checkpoint import Checkpointer
sys.path.append(".")
logger = logging.getLogger(__name__)
class MoCoPretrainingTrainer(DefaultTrainer):
def __init__(self, cfg):
super().__init__(cfg)
self.model.max_iter = cfg.train.train_iter
self._trainer = MoCoEagerTrainer(
self.model, self.train_loader, self.optimizer, cfg.train.num_accumulation_steps
)
def main(args):
cfg = LazyConfig.load(args.config_file)
cfg = LazyConfig.apply_overrides(cfg, args.opts)
if try_get_key(cfg, "graph.enabled") is True:
raise NotImplementedError(
"LiBai MOCO only support eager global mode now, please set cfg.graph.enabled=False"
)
default_setup(cfg, args)
if args.fast_dev_run:
cfg.train.train_epoch = 0
cfg.train.train_iter = 20
cfg.train.eval_period = 10
cfg.train.log_period = 1
if args.eval_only:
tokenizer = None
if try_get_key(cfg, "tokenization.setup", default=False):
tokenizer = MoCoPretrainingTrainer.build_tokenizer(cfg)
model = MoCoPretrainingTrainer.build_model(cfg)
Checkpointer(model, save_dir=cfg.train.output_dir).resume_or_load(
cfg.train.load_weight, resume=args.resume
)
if try_get_key(cfg, "train.graph.enabled", default=False):
model = MoCoPretrainingTrainer.build_graph(cfg, model, is_train=False)
test_loader = MoCoPretrainingTrainer.build_test_loader(cfg, tokenizer)
_ = MoCoPretrainingTrainer.test(cfg, test_loader, model)
return
trainer = MoCoPretrainingTrainer(cfg)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
main(args)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from typing import Callable
from libai.engine.trainer import EagerTrainer
class MoCoEagerTrainer(EagerTrainer):
def run_step(self, get_batch: Callable):
assert self.model.training, "[MoCoEagerTrainer] model was changed to eval mode!"
start = time.perf_counter()
# If you want to do something with the data, you can wrap the dataloader.
data = next(self._data_loader_iter)
data = get_batch(data, getattr(self.data_loader, "mixup_func", None))
data_time = time.perf_counter() - start
# update the moco_momentum per step
loss_dict, m_dict = self.model(**data, cu_iter=self.iter, m=self.model.m)
self.model.m = m_dict["m"]
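# The adjusted momentum is written back onto the model above, so the next
# run_step call computes its schedule from this updated value rather than
# from the original base m.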
losses = sum(loss_dict.values()) / self.grad_acc_steps
losses.backward()
self.write_metrics(loss_dict, data_time)
if (self.iter + 1) % self.grad_acc_steps == 0:
self.optimizer.step()
self.optimizer.zero_grad()
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from flowvision import transforms
from flowvision.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from libai.config import LazyCall
train_aug = [
LazyCall(transforms.RandomResizedCrop)(size=224),
LazyCall(transforms.RandomHorizontalFlip)(),
LazyCall(transforms.ToTensor)(),
LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import oneflow as flow
from flowvision import transforms
from flowvision.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from PIL import ImageFilter, ImageOps
from libai.config import LazyCall
class GaussianBlur(object):
"""Gaussian blur augmentation from SimCLR: https://arxiv.org/abs/2002.05709"""
def __init__(self, sigma=[0.1, 2.0]):
self.sigma = sigma
def __call__(self, x):
sigma = random.uniform(self.sigma[0], self.sigma[1])
x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
return x
class Solarize(object):
"""Solarize augmentation from BYOL: https://arxiv.org/abs/2006.07733"""
def __call__(self, x):
return ImageOps.solarize(x)
# follow BYOL's augmentation recipe: https://arxiv.org/abs/2006.07733
augmentation1 = [
LazyCall(transforms.RandomResizedCrop)(size=224, scale=(0.2, 1.0)),
LazyCall(transforms.RandomApply)(
transforms=[
LazyCall(transforms.ColorJitter)(
brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1
) # not strengthened
],
p=0.8,
),
# TODO: Add RandomGrayscale
# LazyCall(transforms.RandomGrayscale)(p=0.2),
LazyCall(transforms.RandomApply)(transforms=[LazyCall(GaussianBlur)(sigma=[0.1, 2.0])], p=1.0),
LazyCall(transforms.RandomHorizontalFlip)(),
LazyCall(transforms.ToTensor)(),
LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]
augmentation2 = [
LazyCall(transforms.RandomResizedCrop)(size=224, scale=(0.2, 1.0)),
LazyCall(transforms.RandomApply)(
transforms=[
LazyCall(transforms.ColorJitter)(
brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1
) # not strengthened
],
p=0.8,
),
# TODO: Add RandomGrayscale
# LazyCall(transforms.RandomGrayscale)(p=0.2),
LazyCall(transforms.RandomApply)(transforms=[LazyCall(GaussianBlur)(sigma=[0.1, 2.0])], p=1.0),
LazyCall(transforms.RandomApply)(transforms=[LazyCall(Solarize)()], p=0.2),
LazyCall(transforms.RandomHorizontalFlip)(),
LazyCall(transforms.ToTensor)(),
LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]
class TwoCropsTransform:
"""Take two random crops of one image"""
def __init__(self, base_transform1, base_transform2):
self.base_transform1 = base_transform1
self.base_transform2 = base_transform2
def __call__(self, x):
im1 = self.base_transform1(x)
im2 = self.base_transform2(x)
return flow.cat((im1, im2), dim=0)
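# The two augmented views are concatenated along the channel dimension, so a
# standard image dataloader yields batches of shape [B, 6, H, W];
# MoCo.forward recovers the two views with flow.chunk(images, 2, dim=1).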
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from utils.weight_convert import load_torch_checkpoint_linear_prob
from libai.utils.checkpoint import (
Checkpointer,
get_missing_parameters_message,
get_unexpected_parameters_message,
)
logger = logging.getLogger("libai." + __name__)
def load_checkpoint(model, path, weight_style, num_heads, embed_dim):
linear_keyword = "head"
for name, param in model.named_parameters():
if name not in ["%s.weight" % linear_keyword, "%s.bias" % linear_keyword]:
param.requires_grad = False
assert weight_style in ["pytorch", "oneflow"]
if weight_style == "pytorch":
params = load_torch_checkpoint_linear_prob(num_heads, embed_dim, path=path)
else:
params = Checkpointer(model).load(path)
model_state_dict = model.state_dict()
# check the incorrect shape and unexpected keys
incorrect_shapes = []
unexpected_keys = []
for k in list(params.keys()):
if k in model_state_dict:
shape_model = tuple(model_state_dict[k].shape)
shape_ckp = tuple(params[k].shape)
if shape_model != shape_ckp:
incorrect_shapes.append((k, shape_ckp, shape_model))
params.pop(k)
model_state_dict.pop(k)
else:
unexpected_keys.append(k)
missing_keys = list(model_state_dict.keys())
for k, shape_checkpoint, shape_model in incorrect_shapes:
logger.warning(
"Skip loading parameter '{}' to the model due to incompatible "
"shapes: {} in the checkpoint but {} in the "
"model! You might want to double check if this is expected.".format(
k, shape_checkpoint, shape_model
)
)
if missing_keys:
logger.info(get_missing_parameters_message(missing_keys))
if unexpected_keys:
logger.info(get_unexpected_parameters_message(unexpected_keys))
model.load_state_dict(params, strict=False)
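# This is invoked from VisionTransformer.__init__ when `linear_prob` is set:
# every parameter except the `head` Linear layer is frozen above, and the
# head itself is re-initialized in vit.py, following the standard MoCo v3
# linear-probing protocol.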
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import oneflow as flow
import torch
logger = logging.getLogger(__name__)
def convert_qkv_weight(value, num_heads, hidden_size):
"""
convert qkv.weight to be compatible with LiBai transformer layer
Args:
value: qkv.weight in the loaded checkpoint
num_heads: number of attention heads
hidden_size: hidden size of the transformer layer
"""
head_size = int(hidden_size / num_heads)
qkv_weight = (
value.view(3, num_heads, head_size, hidden_size)
.permute(1, 0, 2, 3)
.contiguous()
.view(hidden_size * 3, hidden_size)
)
return qkv_weight
def convert_qkv_bias(value, num_heads, hidden_size):
"""
convert qkv.bias to be compatible with LiBai transformer layer
Args:
value: qkv.bias in the loaded checkpoint
num_heads: number of attention heads
hidden_size: hidden size of the transformer layer
"""
head_size = int(hidden_size / num_heads)
qkv_bias = (
value.view(3, num_heads, head_size).permute(1, 0, 2).contiguous().view(hidden_size * 3)
)
return qkv_bias
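# Worked example of the layout change (hypothetical sizes: num_heads=2,
# head_size=1, hidden_size=2): a torch qkv.weight stores rows in qkv-major
# order [q0, q1, k0, k1, v0, v1]; the view/permute above regroups them
# head-major as [q0, k0, v0, q1, k1, v1], the per-head interleaved layout
# that LiBai's fused query_key_value Linear expects.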
def filter_keys(key, value, num_heads, hidden_size):
"""Filtering the state_dict keys and values to match LiBai's MOCOV3 model"""
if "norm1" in key:
key = key.replace("norm1", "input_layernorm")
elif "attn.qkv" in key:
key = key.replace("attn.qkv", "self_attention.query_key_value")
if "weight" in key:
value = convert_qkv_weight(value, num_heads, hidden_size)
if "bias" in key:
value = convert_qkv_bias(value, num_heads, hidden_size)
elif "attn.proj" in key:
key = key.replace("attn.proj", "self_attention.dense")
elif "norm2" in key:
key = key.replace("norm2", "post_attention_layernorm")
elif "mlp.fc1" in key:
key = key.replace("mlp.fc1", "mlp.dense_h_to_4h")
elif "mlp.fc2" in key:
key = key.replace("mlp.fc2", "mlp.dense_4h_to_h")
elif "fc_norm" in key:
key = key.replace("fc_norm", "norm")
return key, value
def load_torch_checkpoint_linear_prob(
num_heads, hidden_size, path="projects/MOCOV3/output/vit-b-300ep.pth.tar", linear_keyword="head"
):
"""Load checkpoint from the given torch weights.
Torch weight from: xxx
"""
torch_dict = torch.load(path, map_location="cpu")["state_dict"]
parameters = torch_dict
new_parameters = dict()
for key, value in parameters.items():
if "num_batches_tracked" not in key:
if key.startswith("module.base_encoder") and not key.startswith(
"module.base_encoder.%s" % linear_keyword
):
# to global tensor
key, val = filter_keys(key, value, num_heads, hidden_size)
val = val.detach().cpu().numpy()
val = flow.tensor(val).to_global(
sbp=flow.sbp.broadcast, placement=flow.placement("cuda", {0: range(1)})
)
new_parameters[key[len("module.base_encoder.") :]] = val
return new_parameters
from omegaconf import DictConfig
from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
cfg = dict(
vocab_size=250112,
hidden_size=768,
hidden_layers=12,
num_attention_heads=12,
head_size=64,
intermediate_size=2048,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
embedding_dropout_prob=0.1,
relative_attention_num_buckets=32,
initializer_range=1.0,
layernorm_eps=1e-06,
amp_enabled=False,
model_type="mt5",
eos_token_id=1,
padding_idx=0,
is_encoder_decoder=True,
tie_word_embeddings=True,
)
cfg = DictConfig(cfg)
mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
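# These hyperparameters appear to match the released google/mt5-base
# checkpoint (d_model=768, 12 layers, 12 heads, d_kv=64, d_ff=2048,
# vocab 250112); the analogous configs that follow track mt5-large and
# mt5-small in the same way.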
from omegaconf import DictConfig
from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
cfg = dict(
vocab_size=250112,
hidden_size=1024,
hidden_layers=24,
num_attention_heads=16,
head_size=64,
intermediate_size=2816,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
embedding_dropout_prob=0.1,
relative_attention_num_buckets=32,
initializer_range=1.0,
layernorm_eps=1e-06,
amp_enabled=False,
model_type="mt5",
eos_token_id=1,
padding_idx=0,
is_encoder_decoder=True,
tie_word_embeddings=False,
)
cfg = DictConfig(cfg)
mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from libai.scheduler import WarmupExponentialLR
from configs.common.train import train
from configs.common.data.t5_dataset import dataloader, tokenization
from configs.common.models.graph import graph
from configs.common.optim import optim
from projects.MT5.configs.mt5_base import pretrain_model as model
vocab_file = "./data_test/bert_data/bert-base-chinese-vocab.txt"
data_prefix = "./data_test/bert_data/loss_compara_content_sentence"
tokenization.tokenizer.vocab_file = vocab_file
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix
# model config
model.cfg.hidden_size = 768
model.cfg.hidden_layers = 12
model.cfg.num_attention_heads = 12
model.cfg.head_size = 64
model.cfg.intermediate_size = 2048
model.cfg.model_type = "mt5"
model.cfg.hidden_dropout_prob = 0.0
model.cfg.attention_probs_dropout_prob = 0.0
model.cfg.embedding_dropout_prob = 0.0
model.cfg.vocab_size = 30522
model.cfg.padding_idx = 0
model.cfg.tie_word_embeddings = False
model.cfg.is_encoder_decoder = False
model.cfg.amp_enabled = True
model.cfg.initializer_range = 0.02
model.cfg.pretrained_model_path = None
train.update(
dict(
output_dir="projects/MT5/output/mt5_output",
train_micro_batch_size=4,
train_epoch=1,
train_iter=24000,
log_period=10,
amp=dict(enabled=True),
warmup_ratio=1 / 24,
# checkpointer=dict(period=10, max_to_keep=20),
input_placement_device="cpu",
dist=dict(
data_parallel_size=2,
tensor_parallel_size=2,
pipeline_parallel_size=1,
pipeline_num_layers=2 * model.cfg.hidden_layers,
),
scheduler=LazyCall(WarmupExponentialLR)(
warmup_factor=0.001,
gamma=1.0,
warmup_method="linear",
warmup_iter=0.0,
),
evaluation=dict(
evaluator=LazyCall(PPLEvaluator)(),
enabled=True,
eval_iter=1e5,
eval_period=5000,
),
)
)
train.zero_optimization.enabled = True
train.zero_optimization.stage = 2
train.activation_checkpoint.enabled = False
train.num_accumulation_steps = 8
from omegaconf import DictConfig
from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
cfg = dict(
vocab_size=250112,
hidden_size=512,
hidden_layers=8,
num_attention_heads=6,
head_size=64,
intermediate_size=1024,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
embedding_dropout_prob=0.1,
relative_attention_num_buckets=32,
initializer_range=1.0,
layernorm_eps=1e-06,
amp_enabled=False,
model_type="mt5",
eos_token_id=1,
padding_idx=0,
is_encoder_decoder=True,
tie_word_embeddings=False,
)
cfg = DictConfig(cfg)
mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
from .mt5_base import cfg
from libai.config import LazyCall
from libai.tokenizer import T5Tokenizer
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
from configs.common.train import train
from configs.common.data.t5_dataset import tokenization
cfg.update(
model_type="t5",
is_encoder_decoder=True,
max_length=20,
min_length=0,
do_sample=False,
early_stopping=False,
num_beams=1,
num_beam_groups=1,
diversity_penalty=0.0,
temperature=1.0,
top_k=50,
top_p=1.0,
typical_p=1.0,
repetition_penalty=1.0,
length_penalty=1.0,
no_repeat_ngram_size=0,
encoder_no_repeat_ngram_size=0,
num_return_sequences=1,
chunk_size_feed_forward=0,
output_scores=False,
forced_bos_token_id=None,
forced_eos_token_id=None,
remove_invalid_values=False,
exponential_decay_length_penalty=None,
use_cache=True,
# Tokenizer
pad_token_id=0,
eos_token_id=1,
bos_token_id=None,
sep_token_id=None,
decoder_start_token_id=0,
)
model = LazyCall(MT5Model)(cfg=cfg)
tokenization.tokenizer = LazyCall(T5Tokenizer)(
vocab_file="/path/to/spiece.model",
add_bos_token=True,
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Tuple
import oneflow as flow
from oneflow import nn
from libai.layers.linear import Linear
from libai.utils import distributed as dist
from projects.MT5.layers.embed_layer import Embedding
class MultiheadAttention(nn.Module):
"""Multi-head attention layer, support self attention and cross attention.
Args:
hidden_size: size of hidden state.
num_attention_heads: number of attention heads.
head_size: size of each attention head.
relative_attention_num_buckets: number of buckets for the T5-style
relative position bias.
is_cross_attention: used to specify whether it is self attention or cross attention.
Defaults to False.
attention_dropout_prob: dropout probability of attention weights.
Defaults to 0.0.
output_dropout_prob: dropout probability of output. Defaults to 0.0.
init_method: method to initialize the input layer weights.
Defaults to ``init.xavier_normal_``.
output_layer_init_method: method to initialize the output layer weights.
If None, use ``init_method``.
layer_idx: a layer index sign which determines the placement.
It will be used in pipeline parallelism. Defaults to 0.
has_relative_attention_bias: whether this layer owns the relative position
bias embedding (in T5, typically only the first layer). Defaults to False.
is_decoder: whether the layer sits in the decoder, which makes the
relative position buckets unidirectional. Defaults to False.
"""
def __init__(
self,
hidden_size,
num_attention_heads,
head_size,
relative_attention_num_buckets,
is_cross_attention=False,
attention_dropout_prob=0.0,
output_dropout_prob=0.0,
init_method=nn.init.xavier_normal_,
output_layer_init_method=None,
padding_idx=None,
*,
layer_idx=0,
has_relative_attention_bias=False,
is_decoder=False,
):
super().__init__()
self.hidden_size = hidden_size
self.relative_attention_num_buckets = relative_attention_num_buckets
self.has_relative_attention_bias = has_relative_attention_bias
self.is_decoder = is_decoder
self.attention_dropout_prob = attention_dropout_prob
if output_layer_init_method is None:
output_layer_init_method = init_method
self.num_heads = num_attention_heads
self.head_size = head_size
self.dropout = nn.Dropout(p=attention_dropout_prob)
self.norm_factor = 1.0 / math.sqrt(float(self.head_size))
self.is_cross_attention = is_cross_attention
self.output_dropout = nn.Dropout(p=output_dropout_prob)
if self.is_cross_attention:
self.query = Linear(
self.hidden_size,
self.num_heads * self.head_size,
bias=False,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.key_value = Linear(
self.hidden_size,
self.num_heads * self.head_size * 2,
bias=False,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
else:
self.query_key_value = Linear(
self.hidden_size,
self.num_heads * self.head_size * 3,
bias=False,
parallel="col",
init_method=init_method,
layer_idx=layer_idx,
)
self.dense = Linear(
self.num_heads * self.head_size,
self.hidden_size,
bias=False,
parallel="row",
init_method=output_layer_init_method,
skip_bias_add=False,
layer_idx=layer_idx,
)
if self.has_relative_attention_bias:
self.relative_attention_bias = Embedding(
self.relative_attention_num_buckets,
self.num_heads,
padding_idx=padding_idx,
layer_idx=layer_idx,
)
def forward(
self,
hidden_states: flow.Tensor,
encoder_states: flow.Tensor = None,
attention_mask: flow.Tensor = None,
past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
use_cache: bool = False,
position_bias=None,
query_length=None,
):
"""
Args:
hidden_states (flow.Tensor): shape is [bsz, tgt_len, hidden_size].
encoder_states (flow.Tensor, optional): shape is [bsz, src_len, hidden_size].
Defaults to None.
attention_mask (flow.Tensor, optional): shape is [bsz, 1, tgt_len, src_len].
It combines the padding mask with a causal mask where needed: for
self-attention in the encoder it is the padding mask of the source input;
for self-attention in the decoder it is the padding mask of the target
input combined with the causal mask; for cross-attention in the decoder
it is the padding mask of the source input.
Defaults to None.
past_key_value (Tuple[flow.Tensor, flow.Tensor], optional): tuple of key and value,
each shape is [bsz, num_heads, src_len, head_size]. Defaults to None.
use_cache (bool, optional): it will be set to True, when the model is in the inference
phase and used for incremental decoding. Defaults to False.
"""
if encoder_states is not None:
encoder_states = encoder_states.to_global(placement=hidden_states.placement)
if attention_mask is not None:
attention_mask = attention_mask.to_global(placement=hidden_states.placement)
# hidden_states shape: [seq_len, batch_size, hidden_size]
real_seq_length, bsz = hidden_states.size()[:2]
if past_key_value is not None:
assert (
len(past_key_value) == 2
), f"past_key_value should have 2 past states (keys and values), got {len(past_key_value)}."
real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
key_length = real_seq_length if encoder_states is None else encoder_states.shape[0]
if self.is_cross_attention:
query = self.query(hidden_states)
query = query.view(-1, bsz, self.num_heads, self.head_size)
query = query.permute(1, 2, 0, 3) # bsz, num_head, seq_len, head_size
if past_key_value is not None:
key, value = past_key_value
elif encoder_states is not None:
key_value = self.key_value(encoder_states)
key_value = key_value.view(-1, bsz, self.num_heads, 2 * self.head_size)
key_value = key_value.permute(1, 2, 0, 3)
key, value = flow.chunk(key_value, chunks=2, dim=-1)
else:
raise ValueError(
"past_key_value and encoder_states cannot be None at the same time."
)
else:
query_key_value = self.query_key_value(hidden_states)
if use_cache:
query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
query_key_value = query_key_value.permute(
0, 2, 1, 3
) # [bsz, num_heads, src_len, 3 * head_size]
query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
else:
attention_scores, value = flow._C.fused_self_attention(
query_key_value, head_size=self.head_size, alpha=1
)
if past_key_value is not None:
past_key, past_value = past_key_value
key = flow.cat((past_key.type_as(key), key), dim=2)
value = flow.cat((past_value.type_as(value), value), dim=2)
if use_cache:
past_key_value = (key, value)
if self.is_cross_attention or use_cache:
attention_scores = flow.matmul(query, key, transpose_b=True, alpha=1)
if position_bias is None:
if not self.has_relative_attention_bias:
position_bias = flow.zeros(
(1, self.num_heads, real_seq_length, key_length),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=attention_scores.placement,
)
else:
position_bias = self.compute_bias(
real_seq_length, key_length, placement=attention_mask.placement
)
if past_key_value is not None:
position_bias = position_bias[:, :, -hidden_states.size(1) :, :]
if attention_mask is not None:
if use_cache:
attention_mask = attention_mask.expand_as(attention_scores)
attention_weights = flow._C.fused_bias_add_scale_mask_softmax_dropout(
attention_scores,
position_bias,
attention_mask,
fill_value=-10000.0,
scale=1,
p=self.attention_dropout_prob,
)[0]
else:
attention_scores = attention_scores + position_bias
attention_weights = flow.softmax(attention_scores, dim=-1)
attention_weights = self.dropout(attention_weights)
context = flow.matmul(attention_weights, value)
""" transpose [batch_size, num_head, seq_len, head_size] to
[seq_len, batch_size, num_head, head_size]
"""
context = flow._C.transpose(context, perm=(2, 0, 1, 3))
output = self.dense(context.flatten(2))
output = self.output_dropout(output)
if use_cache:
output = (output, past_key_value)
output = (output,) + (position_bias,)
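# The value returned below is (output, position_bias), or
# ((output, past_key_value), position_bias) when use_cache is True, so the
# caller can thread both the KV cache and the shared position bias through
# the layer stack.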
return output
def extra_repr(self) -> str:
return "hidden_size={}, num_heads={}, is_cross_attention={}".format(
self.hidden_size,
self.num_heads,
self.is_cross_attention,
)
def _relative_position_bucket(
self, relative_position, bidirectional=True, num_buckets=32, max_distance=128
):
relative_buckets = 0
if bidirectional:
num_buckets //= 2
relative_buckets = (
relative_buckets + (relative_position > 0).to(flow.long) * num_buckets
)
relative_position = flow.abs(relative_position)
else:
relative_position = (
-1
* flow.min(
relative_position,
flow.zeros(
relative_position.size(),
sbp=relative_position.sbp,
placement=relative_position.placement,
),
).to(flow.long)
)
max_exact = num_buckets // 2
is_small = relative_position < max_exact
relative_position_if_large = max_exact + (
flow.log(relative_position.float() / max_exact)
/ math.log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(flow.long)
relative_position_if_large = flow.min(
relative_position_if_large,
flow.zeros(
relative_position_if_large.size(),
dtype=relative_position_if_large.dtype,
sbp=relative_position_if_large.sbp,
placement=relative_position_if_large.placement,
).fill_(num_buckets - 1),
)
relative_buckets = relative_buckets + flow.where(
is_small, relative_position, relative_position_if_large
)
return relative_buckets
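# Bucketing scheme (as in T5): in the bidirectional case half of the buckets
# encode the sign of the offset; within each half, offsets below max_exact
# get their own bucket and larger offsets share logarithmically spaced
# buckets up to max_distance.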
def compute_bias(self, query_length, key_length, placement=None):
"""Compute binned relative position bias"""
context_position = flow.arange(
query_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=placement,
)
memory_position = flow.arange(
key_length,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=placement,
)
relative_position = (
memory_position[None, :] - context_position[:, None]
) # shape (query_length, key_length)
relative_position_bucket = self._relative_position_bucket(
relative_position,
bidirectional=(not self.is_decoder),
num_buckets=self.relative_attention_num_buckets,
) # shape (query_length, key_length)
values = self.relative_attention_bias(
relative_position_bucket
) # shape (query_length, key_length, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(
0
) # shape (1, num_heads, query_length, key_length)
return values
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
from oneflow.nn import init
import libai.utils.distributed as dist
from libai.layers.embedding import VocabEmbedding
class MT5Embedding(flow.nn.Module):
def __init__(
self,
hidden_size,
vocab_size,
embedding_dropout_prob,
pad_token_id=0,
init_method=flow.nn.init.xavier_normal_,
amp_enabled=False,
) -> None:
super().__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.word_embeddings = VocabEmbedding(
num_embeddings=vocab_size,
embedding_dim=hidden_size,
init_method=init_method,
amp_enabled=amp_enabled,
padding_idx=pad_token_id,
)
self.embedding_dropout = flow.nn.Dropout(embedding_dropout_prob)
def forward(self, input_ids):
word_embeddings = self.word_embeddings(input_ids)
embeddings = self.embedding_dropout(word_embeddings)
return embeddings
class Embedding(nn.Module):
"""Construct the trainable embedding module, which does not support parallelization.
This can be used for positional embedding and token type embedding.
Arguments:
num_embeddings: size of vocabulary.
embedding_dim: dimension of embeddings.
padding_idx: pad index. Defaults to None.
init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
amp_enabled: fp16 option for embedding weight. Defaults to False.
"""
def __init__(
self,
num_embeddings,
embedding_dim,
padding_idx=None,
init_method=init.xavier_normal_,
amp_enabled=False,
layer_idx=0,
):
super().__init__()
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
if padding_idx is not None:
if padding_idx > 0:
assert (
padding_idx < self.num_embeddings
), "Padding_idx must be within num_embeddings"
elif padding_idx < 0:
assert (
padding_idx >= -self.num_embeddings
), "Padding_idx must be within num_embeddings"
padding_idx = self.num_embeddings + padding_idx
self.padding_idx = padding_idx
self.init_method = init_method
self.amp_enabled = amp_enabled
assert num_embeddings > 0
self.weight = nn.Parameter(
flow.empty(
(num_embeddings, embedding_dim),
dtype=flow.float32,
placement=dist.get_layer_placement(layer_idx),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
)
self.init_method(self.weight)
def forward(self, input_ids):
weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
input_embeds = flow._C.gather(weight, input_ids, axis=0)
return input_embeds
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with flow.no_grad():
self.weight[self.padding_idx] = flow.zeros(
self.embedding_dim,
placement=dist.get_layer_placement(0),
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
)
def extra_repr(self) -> str:
s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
if self.padding_idx is not None:
s += ", padding_idx={padding_idx}"
return s.format(**self.__dict__)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from oneflow import nn
from libai.layers import Linear, LMLogits
class LMHead(nn.Module):
def __init__(self, model_type, hidden_size, vocab_size, hidden_layers):
super().__init__()
if model_type == "mt5":
self.lm_head = Linear(
hidden_size, vocab_size, bias=False, layer_idx=2 * hidden_layers - 1
)
else:
self.lm_head = LMLogits(vocab_size, bias=True)
def forward(self, decoder_states, embed_weight=None):
if isinstance(self.lm_head, Linear):
logits = self.lm_head(decoder_states)
else:
logits = self.lm_head(decoder_states, embed_weight)
return logits
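# Design note: for model_type "mt5" the head is an untied Linear placed on
# the last pipeline stage (layer_idx = 2 * hidden_layers - 1); otherwise
# LMLogits reuses the token embedding weight passed in as `embed_weight`,
# i.e. tied input/output embeddings as in the original T5.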