Commit 5988d2cc authored by yuguo960516

bert-large

parent 478602ba
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import oneflow as flow
from .base_loader import ModelLoaderHuggerFace, ModelLoaderLiBai
class ViTLoaderHuggerFace(ModelLoaderHuggerFace):
def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
"""NOTE: base_model_prefix_1 is ViT's prefix in Transformers.
base_model_prefix_2 is ViT's prefix in LiBai."""
self.base_model_prefix_1 = "vit"
self.base_model_prefix_2 = ""
def _convert_state_dict(self, flow_state_dict, cfg=None):
"""Convert state_dict's keys to match model.
Args:
flow_state_dict (OrderedDict): model state dict.
cfg (dict): model's default config dict.
Returns:
OrderedDict: flow state dict.
"""
# The converted checkpoint.
oneflow_state_dict = flow_state_dict.copy()
# Get configs
num_heads = cfg.get("num_heads")
hidden_size = cfg.get("embed_dim")
        head_size = hidden_size // num_heads
# prefix
has_prefix = any(s.startswith(self.base_model_prefix_1) for s in oneflow_state_dict)
index_idx = 3 if has_prefix else 2
old_keys = oneflow_state_dict.keys()
for key in list(old_keys):
# Convert vit's embedding layers
if "embeddings" in key:
if "cls_token" in key:
new_key = "cls_token"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "position_embeddings" in key:
new_key = "pos_embed"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "patch_embeddings.projection" in key:
if "weight" in key:
new_key = "patch_embed.proj.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "patch_embed.proj.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
# Convert vit's layernorm layers
elif "layernorm_before" in key:
index_block = key.split(".")[index_idx]
if "weight" in key:
new_key = "blocks." + index_block + ".input_layernorm.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "blocks." + index_block + ".input_layernorm.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "layernorm_after" in key:
index_block = key.split(".")[index_idx]
if "weight" in key:
new_key = "blocks." + index_block + ".post_attention_layernorm.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "blocks." + index_block + ".post_attention_layernorm.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
# Convert vit's attention layers
elif "attention" in key:
index_block = key.split(".")[index_idx]
if "attention.attention" in key:
if (
"blocks." + index_block + ".self_attention.query_key_value.weight"
in oneflow_state_dict.keys()
):
continue
q_w = key
k_w = q_w.replace("query", "key")
v_w = q_w.replace("query", "value")
q_b = q_w.replace("weight", "bias")
k_b = k_w.replace("weight", "bias")
v_b = v_w.replace("weight", "bias")
qkv_w = flow.cat(
(
oneflow_state_dict.pop(q_w),
oneflow_state_dict.pop(k_w),
oneflow_state_dict.pop(v_w),
),
dim=0,
)
qkv_b = flow.cat(
(
oneflow_state_dict.pop(q_b),
oneflow_state_dict.pop(k_b),
oneflow_state_dict.pop(v_b),
),
dim=-1,
)
qkv_w = self._fix_qkv_ordering(qkv_w, head_size, num_heads)
qkv_b = self._fix_qkv_ordering(qkv_b, head_size, num_heads)
new_key = "blocks." + index_block + ".self_attention.query_key_value.weight"
oneflow_state_dict[new_key] = qkv_w
new_key = new_key.replace("weight", "bias")
oneflow_state_dict[new_key] = qkv_b
elif "output" in key:
if "dense" in key:
if "weight" in key:
new_key = "blocks." + index_block + ".self_attention.dense.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
if "bias" in key:
new_key = "blocks." + index_block + ".self_attention.dense.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "intermediate" in key:
index_block = key.split(".")[index_idx]
if "weight" in key:
if (
"blocks." + index_block + ".mlp.dense_h_to_4h.weight"
in oneflow_state_dict.keys()
):
continue
w = key
b = key.replace("weight", "bias")
new_key = "blocks." + index_block + ".mlp.dense_h_to_4h.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(w)
new_key = new_key.replace("weight", "bias")
oneflow_state_dict[new_key] = oneflow_state_dict.pop(b)
elif "output" in key:
index_block = key.split(".")[index_idx]
if "dense.weight" in key:
if (
"blocks." + index_block + ".mlp.dense_4h_to_h.weight"
in oneflow_state_dict.keys()
):
continue
w = key
b = w.replace("weight", "bias")
new_key = "blocks." + index_block + ".mlp.dense_4h_to_h.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(w)
new_key = new_key.replace("weight", "bias")
oneflow_state_dict[new_key] = oneflow_state_dict.pop(b)
elif "layernorm" in key:
if "weight" in key:
new_key = "norm.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "norm.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "classifier" in key:
if "weight" in key:
new_key = "head.weight"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
elif "bias" in key:
new_key = "head.bias"
oneflow_state_dict[new_key] = oneflow_state_dict.pop(key)
else:
oneflow_state_dict[key] = oneflow_state_dict.pop(key)
return oneflow_state_dict
def _load_config_from_json(self, config_file):
"""load config from `config.json`, and update default config.
Args:
config_file (str): Path of config file.
"""
with open(config_file, mode="r", encoding="utf-8") as f:
cfg_dict = json.load(f)
# update libai_cfg by config.json
self._update_cfg("img_size", cfg_dict["image_size"])
self._update_cfg("patch_size", cfg_dict["patch_size"])
self._update_cfg("in_chans", cfg_dict["num_channels"])
self._update_cfg("embed_dim", cfg_dict["hidden_size"])
self._update_cfg("depth", cfg_dict["num_hidden_layers"])
self._update_cfg("num_heads", cfg_dict["num_attention_heads"])
self._update_cfg("attn_drop_rate", cfg_dict["attention_probs_dropout_prob"])
self._update_cfg("drop_rate", cfg_dict["hidden_dropout_prob"])
# update libai_cfg by kwargs
for k, v in self.kwargs.items():
self._update_cfg(k, v)
self._update_cfg_log()
class ViTLoaderLiBai(ModelLoaderLiBai):
def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs):
super().__init__(model, libai_cfg, pretrained_model_path, **kwargs)
self.base_model_prefix_2 = ""
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import oneflow.nn as nn
def init_method_normal(sigma, mean=0.0):
"""Init method based on N(0, sigma)."""
def init_(tensor):
return nn.init.normal_(tensor, mean=mean, std=sigma)
return init_
def scaled_init_method_normal(sigma, num_layers, mean=0.0):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor):
return nn.init.normal_(tensor, mean=mean, std=std)
return init_
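# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): the scaled init
# shrinks the std by sqrt(2 * num_layers), which keeps the variance of deep
# residual branches under control. The sigma and depth below are assumptions.
if __name__ == "__main__":
    import oneflow as flow

    init_fn = scaled_init_method_normal(sigma=0.02, num_layers=12)
    w = flow.empty(4, 4)
    init_fn(w)  # samples from N(0, 0.02 / sqrt(24)) ~= N(0, 0.0041)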
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
import libai.utils.distributed as dist
from libai.config.config import configurable
from libai.layers import LayerNorm, Linear, PatchEmbedding, TransformerLayer
class VisionTransformer(nn.Module):
"""Vision Transformer in LiBai.
LiBai's implementation of:
`An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
<https://arxiv.org/abs/2010.11929>`_
Args:
img_size (int, tuple(int)): input image size
patch_size (int, tuple(int)): patch size
in_chans (int): number of input channels
embed_dim (int): embedding dimension
depth (int): depth of transformer
num_heads (int): number of attention heads
        mlp_ratio (float): ratio of mlp hidden dim to embedding dim
drop_rate (float): dropout rate
attn_drop_rate (float): attention dropout rate
drop_path_rate (float): stochastic depth rate
num_classes (int): number of classes for classification head
loss_func (callable, optional): loss function for computing the total loss
between logits and labels
"""
@configurable
def __init__(
self,
img_size=224,
patch_size=16,
in_chans=3,
embed_dim=192,
depth=12,
num_heads=3,
mlp_ratio=4.0,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.0,
num_classes=1000,
loss_func=None,
):
super().__init__()
self.img_size = img_size
self.num_classes = num_classes
self.patch_embed = PatchEmbedding(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
)
ffn_size = int(embed_dim * mlp_ratio)
num_patches = self.patch_embed.num_patches
self.cls_token = nn.Parameter(
flow.zeros(
1,
1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.pos_embed = nn.Parameter(
flow.zeros(
1,
num_patches + 1,
embed_dim,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=dist.get_layer_placement(0),
)
)
self.pos_drop = nn.Dropout(p=drop_rate)
dpr = [
x.item() for x in flow.linspace(0, drop_path_rate, depth)
] # stochastic depth decay rule
self.blocks = nn.Sequential(
*[
TransformerLayer(
hidden_size=embed_dim,
ffn_hidden_size=ffn_size,
num_attention_heads=num_heads,
attention_dropout_prob=attn_drop_rate,
output_dropout_prob=drop_rate,
drop_path_prob=dpr[i],
layer_idx=i,
)
for i in range(depth)
]
)
self.norm = LayerNorm(embed_dim, layer_idx=-1)
self.head = Linear(embed_dim, num_classes, layer_idx=-1)
# loss func
self.loss_func = nn.CrossEntropyLoss() if loss_func is None else loss_func
# weight init
trunc_normal_(self.pos_embed, std=0.02)
trunc_normal_(self.cls_token, std=0.02)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, Linear):
trunc_normal_(m.weight, std=0.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
def no_weight_decay(self):
return {"pos_embed", "cls_token"}
@classmethod
def from_config(cls, cfg):
return {
"img_size": cfg.img_size,
"patch_size": cfg.patch_size,
"in_chans": cfg.in_chans,
"embed_dim": cfg.embed_dim,
"depth": cfg.depth,
"num_heads": cfg.num_heads,
"mlp_ratio": cfg.mlp_ratio,
"drop_rate": cfg.drop_rate,
"attn_drop_rate": cfg.attn_drop_rate,
"drop_path_rate": cfg.drop_path_rate,
"num_classes": cfg.num_classes,
"loss_func": cfg.loss_func,
}
def forward_features(self, x):
# patch embedding
x = self.patch_embed(x)
cls_token = self.cls_token.expand(
x.shape[0], -1, -1
) # stole cls_tokens impl from Phil Wang, thanks
cls_token = cls_token.to_global(sbp=x.sbp, placement=cls_token.placement)
x = flow.cat((cls_token, x), dim=1)
# position embedding
pos_embed = self.pos_embed.expand(x.shape[0], -1, -1)
pos_embed = pos_embed.to_global(sbp=x.sbp, placement=pos_embed.placement)
x = self.pos_drop(x + pos_embed)
# transformer block
x = self.blocks(x)
return x
def forward_head(self, x):
x = self.norm(x)
outcome = x[:, 0]
outcome = self.head(outcome)
return outcome
def forward(self, images, labels=None):
"""
Args:
images (flow.Tensor): training samples.
labels (flow.LongTensor, optional): training targets
Returns:
dict:
A dict containing :code:`loss_value` or :code:`logits`
depending on training or evaluation mode.
:code:`{"losses": loss_value}` when training,
:code:`{"prediction_scores": logits}` when evaluating.
"""
x = self.forward_features(images)
x = self.forward_head(x)
if labels is not None and self.training:
losses = self.loss_func(x, labels)
return {"losses": losses}
else:
return {"prediction_scores": x}
@staticmethod
def set_pipeline_stage_id(model):
dist_utils = dist.get_dist_util()
# Set pipeline parallelism stage_id
if hasattr(model.pos_embed, "config"):
# Old API in OneFlow 0.8
for module_block in model.modules():
if isinstance(module_block.origin, PatchEmbedding):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.origin, TransformerLayer):
module_block.config.set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set pos_embed and cls_token stage id
model.pos_embed.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.cls_token.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.config.set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.norm.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.config.set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
else:
for module_block in model.modules():
if isinstance(module_block.to(nn.Module), PatchEmbedding):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
elif isinstance(module_block.to(nn.Module), TransformerLayer):
module_block.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(module_block.layer_idx),
dist.get_layer_placement(module_block.layer_idx),
)
# Set pos_embed and cls_token stage id
model.pos_embed.to(flow.nn.graph.GraphTensor).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.cls_token.to(flow.nn.graph.GraphTensor).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.pos_drop.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(0), dist.get_layer_placement(0)
)
model.norm.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.head.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
model.loss_func.to(flow.nn.graph.GraphModule).set_stage(
dist_utils.get_layer_stage_id(-1), dist.get_layer_placement(-1)
)
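# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): the stochastic
# depth decay rule used in __init__ above. The drop-path probability grows
# linearly from 0 at the first block to drop_path_rate at the last one.
if __name__ == "__main__":
    depth, drop_path_rate = 12, 0.1
    dpr = [x.item() for x in flow.linspace(0, drop_path_rate, depth)]
    print(dpr[0], dpr[-1])  # 0.0 0.1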
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check
from libai.config import LazyConfig
from libai.models.utils import GPT2LoaderLiBai
from projects.MagicPrompt.gpt2 import GPTModel
def get_model(config_file):
cfg = LazyConfig.load(config_file)
cfg.model.cfg.pretrained_model_path = None
cfg.dataloader = None
cfg.tokenization = None
print("Building model....")
    loader = GPT2LoaderLiBai(GPTModel, cfg.model.cfg, "/path/to/model")
model = loader.load()
print("Build model finished.")
return model
class gpt2Graph(nn.Graph):
def __init__(self, eager_model):
super().__init__()
self.model = eager_model
def build(
self,
input_ids,
):
out = self.model(
input_ids,
)
return out
if __name__ == "__main__":
model = get_model("projects/MagicPrompt/configs/gpt2_inference.py")
model.eval()
gpt2_graph = gpt2Graph(model)
# Build the static graph model
input_ids = flow.ones(
1, 5, dtype=flow.int64, sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
)
# check your model.forward is valid
# output = gpt2_graph(
# input_ids
# )
print("Compiling the graph which may make some time, please wait for a moment....")
gpt2_graph._compile(
input_ids,
)
convert_to_onnx_and_check(
gpt2_graph,
external_data=False,
opset=11,
flow_weight_dir=None,
onnx_model_path="./",
dynamic_batch_size=False,
device="gpu_global",
input_tensor_range=[0, 10],
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from typing import List, Optional

import numpy as np
import onnxruntime as ort
from packaging import version
class OnnxModel:
    """Thin wrapper around an onnxruntime InferenceSession for running exported models."""
def __init__(
self,
onnx_filename,
        providers: Optional[List[str]] = None,
ort_optimize: bool = True,
):
ort_sess_opt = ort.SessionOptions()
ort_sess_opt.graph_optimization_level = (
ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
if ort_optimize
else ort.GraphOptimizationLevel.ORT_DISABLE_ALL
)
        if providers is None:
            # Plain string comparison mis-orders versions like "1.10.0" < "1.9.0",
            # so parse the version before comparing.
            if version.parse(ort.__version__) > version.parse("1.9.0"):
providers = [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
]
else:
providers = ["CPUExecutionProvider"]
self.sess = ort.InferenceSession(
onnx_filename, sess_options=ort_sess_opt, providers=providers
)
def forward(self, input_list):
ipt_dict = OrderedDict()
for idx, ipt in enumerate(self.sess.get_inputs()):
ipt_dict[ipt.name] = input_list[idx]
onnx_res = self.sess.run([], ipt_dict)
return onnx_res
if __name__ == "__main__":
onnx_model = OnnxModel("model.onnx")
input_list = [
        np.ones((1, 5)).astype(np.int64),
]
print(onnx_model.forward(input_list))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from typing import List, Optional

import numpy as np
import onnxruntime as ort
from packaging import version
class OnnxModel:
    """Thin wrapper around an onnxruntime InferenceSession for running exported models."""
def __init__(
self,
onnx_filename,
        providers: Optional[List[str]] = None,
ort_optimize: bool = True,
):
ort_sess_opt = ort.SessionOptions()
ort_sess_opt.graph_optimization_level = (
ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
if ort_optimize
else ort.GraphOptimizationLevel.ORT_DISABLE_ALL
)
        if providers is None:
            # Plain string comparison mis-orders versions like "1.10.0" < "1.9.0",
            # so parse the version before comparing.
            if version.parse(ort.__version__) > version.parse("1.9.0"):
providers = [
"TensorrtExecutionProvider",
"CUDAExecutionProvider",
"CPUExecutionProvider",
]
else:
providers = ["CPUExecutionProvider"]
self.sess = ort.InferenceSession(
onnx_filename, sess_options=ort_sess_opt, providers=providers
)
def forward(self, input_list):
ipt_dict = OrderedDict()
for idx, ipt in enumerate(self.sess.get_inputs()):
ipt_dict[ipt.name] = input_list[idx]
onnx_res = self.sess.run([], ipt_dict)
return onnx_res
if __name__ == "__main__":
onnx_model = OnnxModel("model.onnx")
input_list = [
np.ones((1, 5)).astype(np.int64),
np.ones((1, 3)).astype(np.int64),
np.ones((1, 5, 5)).astype(bool),
np.ones((1, 3, 3)).astype(bool),
np.ones((1, 3, 5)).astype(bool),
]
print(onnx_model.forward(input_list))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow import nn
from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check
from libai.config import LazyConfig
from projects.MT5.mt5_model import MT5Model
from projects.MT5.utils.mt5_loader import T5LoaderHuggerFace
def get_model(config_file):
cfg = LazyConfig.load(config_file)
cfg.model.cfg.model_type = "mt5"
cfg.model.cfg.pretrained_model_path = None
cfg.dataloader = None
cfg.tokenization = None
print("Building model....")
loader = T5LoaderHuggerFace(MT5Model, cfg.model.cfg, "/path/to/model")
model = loader.load()
print("Build model finished.")
return model
class t5Graph(nn.Graph):
def __init__(self, eager_model):
super().__init__()
self.model = eager_model
    def build(
        self,
        encoder_input_ids,
        decoder_input_ids,
        encoder_attn_mask,
        decoder_attn_mask,
        encoder_decoder_attn_mask,
    ):
        out = self.model(
            encoder_input_ids,
            decoder_input_ids,
            encoder_attn_mask,
            decoder_attn_mask,
            encoder_decoder_attn_mask,
        )
        return out
if __name__ == "__main__":
model = get_model("projects/MT5/configs/mt5_pretrain.py")
model.eval()
t5_graph = t5Graph(model)
# Build the static graph model
encoder_input_ids = flow.ones(
1, 5, dtype=flow.int64, sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
)
    # ids are 2-D int64 tensors; attention masks are 3-D bool tensors. They are
    # passed positionally to the model's forward, so the names below follow the
    # tensor contents.
    decoder_input_ids = flow.ones(
        1, 3, dtype=flow.int64, sbp=flow.sbp.broadcast, placement=flow.placement("cuda", ranks=[0])
    )
    encoder_attn_mask = flow.ones(
        1,
        5,
        5,
        dtype=flow.bool,
        sbp=flow.sbp.broadcast,
        placement=flow.placement("cuda", ranks=[0]),
    )
decoder_attn_mask = flow.ones(
1,
3,
3,
dtype=flow.bool,
sbp=flow.sbp.broadcast,
placement=flow.placement("cuda", ranks=[0]),
)
encoder_decoder_attn_mask = flow.ones(
1,
3,
5,
dtype=flow.bool,
sbp=flow.sbp.broadcast,
placement=flow.placement("cuda", ranks=[0]),
)
    # check your model.forward is valid
    # output = t5_graph(
    #     encoder_input_ids,
    #     decoder_input_ids,
    #     encoder_attn_mask,
    #     decoder_attn_mask,
    #     encoder_decoder_attn_mask,
    # )
    # print(output)
print("Compiling the graph which may make some time, please wait for a moment....")
    t5_graph._compile(
        encoder_input_ids,
        decoder_input_ids,
        encoder_attn_mask,
        decoder_attn_mask,
        encoder_decoder_attn_mask,
    )
convert_to_onnx_and_check(
t5_graph,
external_data=False,
opset=11,
flow_weight_dir=None,
onnx_model_path="./",
dynamic_batch_size=False,
device="gpu_global",
input_tensor_range=[0, 10],
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .build import build_optimizer, get_default_optimizer_params
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import defaultdict
from typing import Any, Dict, List
import oneflow as flow
from libai.config import instantiate
from libai.layers import LayerNorm
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/solver/build.py
# --------------------------------------------------------
def build_optimizer(cfg, model):
"""
Build an optimizer from config.
"""
cfg.params.model = model
optim = instantiate(cfg)
return optim
def get_default_optimizer_params(
model,
base_lr=None,
weight_decay=None,
weight_decay_norm=None,
weight_decay_bias=None,
clip_grad_max_norm=None,
clip_grad_norm_type=None,
overrides=None,
):
"""
    Get default param list for the optimizer, with support for a few types of overrides.
If no overrides are needed, it is equivalent to `model.parameters()`.
Arguments:
base_lr: lr for every group by default. Can be omitted to use the one in optimizer.
weight_decay: weight decay for every group by default. Can be omitted to use the one
in optimizer.
weight_decay_norm: override weight decay for params in normalization layers
weight_decay_bias: override weight decay for bias parameters
overrides: if not `None`, provides values for optimizer hyperparameters
(LR, weight decay) for module parameters with a given name; e.g.
``{"embedding": {"lr": 0.01, "weight_decay": 0.1}}`` will set the LR and
weight decay values for all module parameters named `embedding`.
For common transformer models, ``weight_decay_norm`` and ``weight_decay_bias``
are usually set to 0.
Example:
::
flow.optim.AdamW(
get_default_optimizer_params(model, weight_decay_norm=0, weight_decay_bias=0),
lr=0.01,
weight_decay=1e-4
)
"""
if overrides is None:
overrides = {}
defaults = {}
if base_lr is not None:
defaults["lr"] = base_lr
if weight_decay is not None:
defaults["weight_decay"] = weight_decay
if clip_grad_max_norm is not None and clip_grad_norm_type is not None:
defaults["clip_grad_max_norm"] = clip_grad_max_norm
defaults["clip_grad_norm_type"] = clip_grad_norm_type
bias_overrides = {}
if weight_decay_bias is not None:
bias_overrides["weight_decay"] = weight_decay_bias
if len(bias_overrides):
if "bias" in overrides:
raise ValueError("Conflicting overrides for 'bias'")
overrides["bias"] = bias_overrides
norm_module_types = (
LayerNorm,
flow.nn.BatchNorm1d,
flow.nn.BatchNorm2d,
flow.nn.BatchNorm3d,
flow.nn.GroupNorm,
flow.nn.InstanceNorm1d,
flow.nn.InstanceNorm2d,
flow.nn.InstanceNorm3d,
flow.nn.FusedBatchNorm1d,
flow.nn.FusedBatchNorm2d,
flow.nn.FusedBatchNorm3d,
)
params = []
memo = set()
for module in model.modules():
for model_param_name, value in module.named_parameters(recurse=False):
if not value.requires_grad:
continue
# Avoid duplicating parameters
if value in memo:
continue
memo.add(value)
hyperparams = copy.copy(defaults)
if isinstance(module, norm_module_types) and weight_decay_norm is not None:
hyperparams["weight_decay"] = weight_decay_norm
hyperparams.update(overrides.get(model_param_name, {}))
params.append({"params": [value], **hyperparams})
return reduce_param_groups(params)
def _expand_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Transform parameter groups into per-parameter structure.
Later items in `params` can overwrite parameters set in previous items.
"""
ret = defaultdict(dict)
for item in params:
assert "params" in item
cur_params = {x: y for x, y in item.items() if x != "params"}
for param in item["params"]:
ret[param].update({"params": [param], **cur_params})
return list(ret.values())
def reduce_param_groups(params: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Reorganize the parameter groups and merge duplicated groups.
    The number of parameter groups needs to be as small as possible in order
    to use the OneFlow multi-tensor optimizer efficiently. Therefore, instead
    of using one parameter group per single parameter, we reorganize the
    parameter groups and merge duplicated groups, which speeds up the
    multi-tensor optimizer significantly.
"""
params = _expand_param_groups(params)
groups = defaultdict(list) # re-group all parameter groups by their hyperparams
for item in params:
cur_params = tuple((x, y) for x, y in item.items() if x != "params")
groups[cur_params].extend(item["params"])
ret = []
for param_keys, param_values in groups.items():
cur = {kv[0]: kv[1] for kv in param_keys}
cur["params"] = param_values
ret.append(cur)
return ret
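# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): on a toy model,
# the per-parameter groups produced by get_default_optimizer_params collapse
# into a few merged groups. The layer sizes and hyperparameters are assumptions.
if __name__ == "__main__":
    toy = flow.nn.Sequential(flow.nn.Linear(4, 4), flow.nn.BatchNorm1d(4))
    groups = get_default_optimizer_params(
        toy, base_lr=1e-3, weight_decay=0.01, weight_decay_norm=0.0, weight_decay_bias=0.0
    )
    # Linear.weight keeps weight_decay=0.01, while both biases and the
    # BatchNorm weight share weight_decay=0.0, so four parameters merge
    # into two groups.
    print(len(groups))  # 2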
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .build import build_lr_scheduler
from .lr_scheduler import (
WarmupCosineAnnealingLR,
WarmupCosineLR,
WarmupExponentialLR,
WarmupMultiStepLR,
WarmupPolynomialLR,
WarmupStepLR,
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from libai.config import instantiate
def build_lr_scheduler(cfg, optimizer):
"""Build learning rate scheduler, defined by ``cfg``."""
cfg.optimizer = optimizer
scheduler = instantiate(cfg)
return scheduler
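# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): `cfg` is
# expected to be a LazyCall config whose `optimizer` field is filled in here.
# The WarmupCosineLR import path and all values below are assumptions.
if __name__ == "__main__":
    import oneflow as flow
    from libai.config import LazyCall
    from libai.scheduler import WarmupCosineLR  # assumed import path

    optimizer = flow.optim.SGD([flow.nn.Parameter(flow.zeros(1))], lr=0.1)
    cfg = LazyCall(WarmupCosineLR)(max_iter=1000, warmup_factor=0.001, warmup_iter=100)
    scheduler = build_lr_scheduler(cfg, optimizer)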
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import oneflow as flow
logger = logging.getLogger(__name__)
def WarmupCosineLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
alpha: float = 0.0,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following
the values of the Cosine function between the initial lr set in the
optimizer to 0, after a warmup period during which it increases linearly
between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
alpha (float, optional): The learning rate scale factor (:math:`\\alpha`). Defaults to 0.0.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in
the first epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
cosine_decay_lr = flow.optim.lr_scheduler.CosineDecayLR(
optimizer, decay_steps=max_iter, alpha=alpha
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return CosineLR")
return cosine_decay_lr
elif warmup_iter > max_iter:
logger.warning("warmup iters is larger than the total training iters")
warmup_cosine_lr = flow.optim.lr_scheduler.WarmUpLR(
cosine_decay_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_cosine_lr
def WarmupCosineAnnealingLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
eta_min: float = 0.0,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following
the values of the Cosine Annealing function between the initial
lr set in the optimizer to 0, after a warmup period during which
it increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
eta_min (float, optional): Minimum learning rate. Defaults to 0.0.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first epoch
and then inreases linearly to reach 1. Defaults to "linear".
"""
cosine_annealing_lr = flow.optim.lr_scheduler.CosineAnnealingLR(
optimizer, T_max=max_iter, eta_min=eta_min
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return CosineAnnealingLR")
return cosine_annealing_lr
warmup_cosine_annealing_lr = flow.optim.lr_scheduler.WarmUpLR(
cosine_annealing_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_cosine_annealing_lr
def WarmupStepLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
step_size: int,
gamma: float = 0.1,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following the values of the Step
function between the initial lr set in the optimizer to 0, after a warmup period during which
it increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
step_size (int): Period of learning rate decay.
gamma (float, optional): Multiplicative factor of learning rate decay. Defaults to 0.1.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first
epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
step_lr = flow.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return StepLR")
return step_lr
warmup_step_lr = flow.optim.lr_scheduler.WarmUpLR(
step_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_step_lr
def WarmupMultiStepLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
milestones: list,
gamma: float = 0.1,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following the values of the MultiStep
function between the initial lr set in the optimizer to 0, after a warmup period during which
it increases linearly between 0 and the initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
milestones (list): List of step indices. Must be increasing.
gamma (float, optional): Multiplicative factor of learning rate decay. Defaults to 0.1.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first
epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
multistep_lr = flow.optim.lr_scheduler.MultiStepLR(
optimizer, milestones=milestones, gamma=gamma
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return MultiStepLR")
return multistep_lr
warmup_multistep_lr = flow.optim.lr_scheduler.WarmUpLR(
multistep_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_multistep_lr
def WarmupExponentialLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
gamma: float,
warmup_factor: float,
warmup_iter: int,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases following the values of
the Exponential function between the initial lr set in the optimizer to 0,
after a warmup period during which it increases linearly between 0 and the
initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
gamma (float): Multiplicative factor of learning rate decay.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first epoch
and then inreases linearly to reach 1. Defaults to "linear".
"""
exponential_lr = flow.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return ExponentialLR")
return exponential_lr
warmup_exponential_lr = flow.optim.lr_scheduler.WarmUpLR(
exponential_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_exponential_lr
def WarmupPolynomialLR(
optimizer: flow.optim.Optimizer,
max_iter: int,
warmup_factor: float,
warmup_iter: int,
end_learning_rate: float = 0.0001,
power: float = 1.0,
cycle: bool = False,
warmup_method: str = "linear",
):
"""Create a schedule with a learning rate that decreases as a polynomial decay from
the initial lr set in the optimizer to end lr defined by `lr_end`,
after a warmup period during which it increases linearly from 0 to the
initial lr set in the optimizer.
Args:
optimizer (flow.optim.Optimizer): Wrapped optimizer.
max_iter (int): Total training iters.
warmup_factor (float): The warmup factor.
warmup_iter (int): The number of warmup steps.
end_learning_rate (float, optional): The final learning rate. Defaults to 0.0001.
power (float, optional): The power of polynomial. Defaults to 1.0.
cycle (bool, optional): If cycle is True, the scheduler will decay the learning rate
every decay steps. Defaults to False.
warmup_method (str, optional): The method of warmup, you can choose "linear" or "constant".
In linear mode, the multiplication factor starts with warmup_factor in the first
epoch and then inreases linearly to reach 1. Defaults to "linear".
"""
polynomial_lr = flow.optim.lr_scheduler.PolynomialLR(
optimizer,
decay_batch=max_iter,
end_learning_rate=end_learning_rate,
power=power,
cycle=cycle,
)
if warmup_iter == 0:
logger.warning("warmup iters equals to zero, return PolynomialLR")
return polynomial_lr
warmup_polynomial_lr = flow.optim.lr_scheduler.WarmUpLR(
polynomial_lr,
warmup_factor=warmup_factor,
warmup_iters=warmup_iter,
warmup_method=warmup_method,
)
return warmup_polynomial_lr
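# ----------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original commit): stepping a
# warmup cosine schedule on a toy optimizer. All hyperparameters below are
# assumptions.
if __name__ == "__main__":
    param = flow.nn.Parameter(flow.zeros(1))
    optimizer = flow.optim.SGD([param], lr=0.1)
    scheduler = WarmupCosineLR(
        optimizer, max_iter=1000, warmup_factor=0.001, warmup_iter=100
    )
    for _ in range(3):
        optimizer.step()
        scheduler.step()
    # During warmup the lr climbs from lr * warmup_factor toward the base lr.
    print(optimizer.param_groups[0]["lr"])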
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .build import build_tokenizer
from .tokenization_bert import BertTokenizer
from .tokenization_roberta import RobertaTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_t5 import T5Tokenizer
from .tokenization_base import PreTrainedTokenizer