ModelZoo / BLOOM_oneflow · Commits · 9fdb7dab

Commit 9fdb7dab, authored Mar 30, 2023 by yuguo960516
Commit message: bloom
Pipeline #150 failed with stages in 0 seconds
Changes: 332 · Pipelines: 1

Showing 20 changed files with 1561 additions and 0 deletions (+1561, -0)
projects/MOCOV3/configs/models/moco_vit_small_patch16.py   +40   -0
projects/MOCOV3/configs/models/vit_base_patch16.py         +19   -0
projects/MOCOV3/configs/models/vit_small_patch16.py        +7    -0
projects/MOCOV3/modeling/moco.py                           +154  -0
projects/MOCOV3/modeling/vit.py                            +157  -0
projects/MOCOV3/pretrain_net.py                            +79   -0
projects/MOCOV3/trainer/moco_trainer.py                    +43   -0
projects/MOCOV3/transform/linear_prob_transform.py         +27   -0
projects/MOCOV3/transform/pretrain_transform.py            +95   -0
projects/MOCOV3/utils/load_checkpoint.py                   +72   -0
projects/MOCOV3/utils/weight_convert.py                    +107  -0
projects/MT5/configs/mt5_base.py                           +30   -0
projects/MT5/configs/mt5_large.py                          +30   -0
projects/MT5/configs/mt5_pretrain.py                       +71   -0
projects/MT5/configs/mt5_small.py                          +30   -0
projects/MT5/configs/t5_inference.py                       +46   -0
projects/MT5/layers/attention_layer.py                     +345  -0
projects/MT5/layers/embed_layer.py                         +121  -0
projects/MT5/layers/lm_head_layer.py                       +36   -0
projects/MT5/layers/logits_layer.py                        +52   -0
projects/MOCOV3/configs/models/moco_vit_small_patch16.py (new file, mode 100644)

from libai.config import LazyCall
from modeling.moco import MoCo_ViT
from modeling.vit import VisionTransformer

base_encoder = LazyCall(VisionTransformer)(
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=384,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    drop_path_rate=0.0,
    global_pool=False,
    stop_grad_conv1=True,
)

momentum_encoder = LazyCall(VisionTransformer)(
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=384,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    drop_path_rate=0.0,
    global_pool=False,
    stop_grad_conv1=True,
)

model = LazyCall(MoCo_ViT)(
    base_encoder=base_encoder,
    momentum_encoder=momentum_encoder,
    dim=256,
    mlp_dim=4096,
    T=0.2,
    m=0.99,
)
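A minimal sketch (not part of the commit) of how such a LazyCall config is typically materialized. It assumes LiBai exposes `LazyConfig.load` and `instantiate` in libai.config, the same lazy-config mechanism the pretrain entry point below relies on; check libai.config for the exact entry point.

import sys

sys.path.append("projects/MOCOV3")  # so `modeling.moco` / `modeling.vit` resolve (assumption)

from libai.config import LazyConfig, instantiate  # `instantiate` is assumed to exist here

cfg = LazyConfig.load("projects/MOCOV3/configs/models/moco_vit_small_patch16.py")
moco = instantiate(cfg.model)  # recursively builds MoCo_ViT together with both ViT encoders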
projects/MOCOV3/configs/models/vit_base_patch16.py (new file, mode 100644)

import sys

sys.path.append("projects/MOCOV3")

from libai.config import LazyCall  # noqa: E402
from modeling.vit import VisionTransformer  # noqa: E402

model = LazyCall(VisionTransformer)(
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    drop_path_rate=0.1,
    global_pool=False,
)
projects/MOCOV3/configs/models/vit_small_patch16.py (new file, mode 100644)

from .vit_base_patch16 import model

model.embed_dim = 384
model.depth = 12
model.num_heads = 12
model.drop_path_rate = 0.0
projects/MOCOV3/modeling/moco.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# MoCo v3 Model
# References:
# moco-v3: https://github.com/facebookresearch/moco-v3/blob/main/moco/builder.py
# --------------------------------------------------------

import math

import oneflow as flow
import oneflow.nn as nn

from libai.layers import Linear
from libai.utils.distributed import get_world_size


class MoCo(nn.Module):
    """
    Build a MoCo model with a base encoder, a momentum encoder, and two MLPs
    https://arxiv.org/abs/1911.05722
    """

    def __init__(
        self, base_encoder, momentum_encoder, dim=256, mlp_dim=4096, T=1.0, m=0.99, max_iter=300
    ):
        """
        dim: feature dimension (default: 256)
        mlp_dim: hidden dimension in MLPs (default: 4096)
        T: softmax temperature (default: 1.0)
        """
        super(MoCo, self).__init__()

        self.T = T
        self.m = m

        # build encoders
        self.base_encoder = base_encoder
        self.momentum_encoder = momentum_encoder
        self.base_encoder.num_classes = dim
        self.momentum_encoder.num_classes = dim
        self.max_iter = max_iter

        self._build_projector_and_predictor_mlps(dim, mlp_dim)

        for param_b, param_m in zip(
            self.base_encoder.parameters(), self.momentum_encoder.parameters()
        ):
            param_m.data.copy_(param_b.data)  # initialize
            param_m.requires_grad = False  # not update by gradient

    def _build_mlp(self, num_layers, input_dim, mlp_dim, output_dim, last_bn=True):
        mlp = []
        for l in range(num_layers):
            dim1 = input_dim if l == 0 else mlp_dim
            dim2 = output_dim if l == num_layers - 1 else mlp_dim

            mlp.append(Linear(dim1, dim2, bias=False))  # libai

            if l < num_layers - 1:
                mlp.append(nn.BatchNorm1d(dim2))
                mlp.append(nn.ReLU(inplace=True))
            elif last_bn:
                # follow SimCLR's design:
                # https://github.com/google-research/simclr/blob/master/model_util.py#L157
                # for simplicity, we further removed gamma in BN
                # TODO: affine should be False (bug here)
                mlp.append(nn.BatchNorm1d(dim2, affine=True))

        return nn.Sequential(*mlp)

    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
        pass

    @flow.no_grad()
    def _update_momentum_encoder(self, m):
        """Momentum update of the momentum encoder"""
        for param_b, param_m in zip(
            self.base_encoder.parameters(), self.momentum_encoder.parameters()
        ):
            param_m.data = param_m.data * m + param_b.data * (1.0 - m)

    def contrastive_loss(self, q, k):
        # normalize
        q = nn.functional.normalize(q, dim=1)
        k = nn.functional.normalize(k, dim=1)
        # gather all targets
        # k = concat_all_gather(k).to_global(sbp=q.sbp, placement=q.placement)
        k = k.to_global(sbp=flow.sbp.broadcast)
        # Einstein sum is more intuitive
        logits = flow.einsum("nc,mc->nm", q, k) / self.T
        N = logits.shape[0] // get_world_size()
        labels = (flow.arange(N, dtype=flow.long) + N * flow.env.get_rank()).to_global(
            sbp=flow.sbp.split(0), placement=logits.placement
        )
        return nn.CrossEntropyLoss()(logits, labels) * (2 * self.T)

    def adjust_moco_momentum(self, cu_iter, m):
        """Adjust moco momentum based on current epoch"""
        m = 1.0 - 0.5 * (1.0 + math.cos(math.pi * cu_iter / self.max_iter)) * (1.0 - m)
        return m

    def forward(self, images, labels=None, cu_iter=0, m=0.99):
        if self.training:
            [x1, x2] = flow.chunk(images, 2, dim=1)
            # compute features
            q1 = self.predictor(self.base_encoder(x1)["prediction_scores"])
            q2 = self.predictor(self.base_encoder(x2)["prediction_scores"])

            m = self.adjust_moco_momentum(cu_iter, m)  # update the moco_momentum
            with flow.no_grad():  # no gradient
                self._update_momentum_encoder(m)  # update the momentum encoder
                # compute momentum features as targets
                k1 = self.momentum_encoder(x1)["prediction_scores"]
                k2 = self.momentum_encoder(x2)["prediction_scores"]

            return (
                {"losses": self.contrastive_loss(q1, k2) + self.contrastive_loss(q2, k1)},
                {"m": m},
            )
        else:
            return self.base_encoder(images)


class MoCo_ViT(MoCo):
    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
        hidden_dim = self.base_encoder.head.weight.shape[1]
        # projectors
        self.base_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
        self.momentum_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
        # predictor
        self.predictor = self._build_mlp(2, dim, mlp_dim, dim)
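The momentum coefficient in adjust_moco_momentum follows a half-cosine schedule from its initial value m up to 1.0 over max_iter steps. A standalone restatement of that formula (plain Python, no OneFlow needed; a sketch for illustration, not part of the commit):

import math

def moco_momentum(cu_iter, m=0.99, max_iter=300):
    # identical formula to MoCo.adjust_moco_momentum above
    return 1.0 - 0.5 * (1.0 + math.cos(math.pi * cu_iter / max_iter)) * (1.0 - m)

print(moco_momentum(0))               # 0.99   (start of training)
print(moco_momentum(150))             # 0.995  (halfway through the schedule)
print(round(moco_momentum(300), 6))   # 1.0    (end: momentum encoder effectively frozen)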
projects/MOCOV3/modeling/vit.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# --------------------------------------------------------
# ViT Model
# References:
# moco-v3: https://github.com/facebookresearch/moco-v3/blob/main/vits.py
# --------------------------------------------------------

import math
from functools import reduce
from operator import mul

import oneflow as flow
import oneflow.nn as nn
from flowvision.layers.weight_init import trunc_normal_
from utils.load_checkpoint import load_checkpoint

from libai.layers import Linear, PatchEmbedding
from libai.models import vision_transformer


class VisionTransformer(vision_transformer.VisionTransformer):
    """Vision Transformer for MOCO
    LiBai impl of: `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
    - https://arxiv.org/abs/2010.11929
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        global_pool=False,
        num_classes=1000,
        loss_func=None,
        linear_prob=None,
        weight_style="pytorch",
        stop_grad_conv1=False,
    ):
        super(VisionTransformer, self).__init__(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            num_classes=num_classes,
            loss_func=loss_func,
        )
        self.global_pool = global_pool
        # weight init
        if linear_prob:
            load_checkpoint(self, linear_prob, weight_style, num_heads, embed_dim)
            self.head.weight.data.normal_(mean=0.0, std=0.01)
            self.head.bias.data.zeros_()
        else:
            trunc_normal_(self.pos_embed, std=0.02)
            trunc_normal_(self.cls_token, std=0.02)
            self.apply(self._init_weights)
            self.stop_grad_conv1 = stop_grad_conv1
            self.embed_dim = embed_dim
            self.initialization()

    def initialization(self):
        # Use fixed 2D sin-cos position embedding
        self.build_2d_sincos_position_embedding()

        # weight initialization
        for name, m in self.named_modules():
            if isinstance(m, Linear):
                if "query_key_value" in name:
                    val = math.sqrt(6.0 / float(m.weight.shape[0] // 3 + m.weight.shape[1]))
                    nn.init.uniform_(m.weight, -val, val)
                else:
                    nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)
        nn.init.normal_(self.cls_token, std=1e-6)

        if isinstance(self.patch_embed, PatchEmbedding):
            # xavier_uniform initialization
            val = math.sqrt(
                6.0 / float(3 * reduce(mul, self.patch_embed.patch_size, 1) + self.embed_dim)
            )
            nn.init.uniform_(self.patch_embed.proj.weight, -val, val)
            nn.init.zeros_(self.patch_embed.proj.bias)

            if self.stop_grad_conv1:
                self.patch_embed.proj.weight.requires_grad = False
                self.patch_embed.proj.bias.requires_grad = False

    def build_2d_sincos_position_embedding(self, temperature=10000.0):
        sbp = self.pos_embed.sbp
        placement = self.pos_embed.placement
        h, w = self.patch_embed.grid_size
        grid_w = flow.arange(w, dtype=flow.float32).to_global(sbp=sbp, placement=placement)
        grid_h = flow.arange(h, dtype=flow.float32).to_global(sbp=sbp, placement=placement)
        grid_w, grid_h = flow.meshgrid(grid_w, grid_h)
        assert (
            self.embed_dim % 4 == 0
        ), "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
        pos_dim = self.embed_dim // 4
        omega = (flow.arange(pos_dim, dtype=flow.float32) / pos_dim).to_global(
            sbp=sbp, placement=placement
        )
        omega = 1.0 / flow.tensor(temperature).to_global(sbp=sbp, placement=placement) ** omega
        out_w = flow.einsum("m,d->md", grid_w.flatten(), omega)
        out_h = flow.einsum("m,d->md", grid_h.flatten(), omega)
        pos_emb = flow.cat(
            [flow.sin(out_w), flow.cos(out_w), flow.sin(out_h), flow.cos(out_h)], dim=1
        )[None, :, :]
        pe_token = flow.zeros([1, 1, self.embed_dim], dtype=flow.float32).to_global(
            sbp=sbp, placement=placement
        )
        self.pos_embed = nn.Parameter(flow.cat([pe_token, pos_emb], dim=1))
        self.pos_embed.requires_grad = False

    def forward_head(self, x):
        if self.global_pool:
            x = x[:, 1:, :].mean(dim=1)  # global pool without cls token
            outcome = self.norm(x)
            outcome = self.head(outcome)
        else:
            x = self.norm(x)
            outcome = x[:, 0]
            outcome = self.head(outcome)
        return outcome
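For reference, the fixed 2D sin-cos position embedding built above can be re-derived with plain NumPy. This is a sketch matching build_2d_sincos_position_embedding (the extra cls-token slot is prepended separately in the model); it is illustrative only and not part of the commit:

import numpy as np

def sincos_pos_embed_2d(h, w, embed_dim, temperature=10000.0):
    # NumPy restatement of build_2d_sincos_position_embedding above
    assert embed_dim % 4 == 0
    grid_w, grid_h = np.meshgrid(
        np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32), indexing="ij"
    )
    pos_dim = embed_dim // 4
    omega = 1.0 / temperature ** (np.arange(pos_dim, dtype=np.float32) / pos_dim)
    out_w = np.einsum("m,d->md", grid_w.flatten(), omega)
    out_h = np.einsum("m,d->md", grid_h.flatten(), omega)
    pos = np.concatenate([np.sin(out_w), np.cos(out_w), np.sin(out_h), np.cos(out_h)], axis=1)
    return pos[None, :, :]  # [1, h*w, embed_dim]

print(sincos_pos_embed_2d(14, 14, 384).shape)  # (1, 196, 384) for a 224/16 patch grid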
projects/MOCOV3/pretrain_net.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import sys

from trainer.moco_trainer import MoCoEagerTrainer

from libai.config import LazyConfig, default_argument_parser, try_get_key
from libai.engine import DefaultTrainer, default_setup
from libai.utils.checkpoint import Checkpointer

sys.path.append(".")

logger = logging.getLogger(__name__)


class MoCoPretrainingTrainer(DefaultTrainer):
    def __init__(self, cfg):
        super().__init__(cfg)
        self.model.max_iter = cfg.train.train_iter
        self._trainer = MoCoEagerTrainer(
            self.model, self.train_loader, self.optimizer, cfg.train.num_accumulation_steps
        )


def main(args):
    cfg = LazyConfig.load(args.config_file)
    cfg = LazyConfig.apply_overrides(cfg, args.opts)
    if try_get_key(cfg, "graph.enabled") is True:
        raise NotImplementedError(
            "LiBai MOCO only support eager global mode now, please set cfg.graph.enabled=False"
        )
    default_setup(cfg, args)

    if args.fast_dev_run:
        cfg.train.train_epoch = 0
        cfg.train.train_iter = 20
        cfg.train.eval_period = 10
        cfg.train.log_period = 1

    if args.eval_only:
        tokenizer = None
        if try_get_key(cfg, "tokenization.setup", default=False):
            tokenizer = MoCoPretrainingTrainer.build_tokenizer(cfg)
        model = MoCoPretrainingTrainer.build_model(cfg)
        Checkpointer(model, save_dir=cfg.train.output_dir).resume_or_load(
            cfg.train.load_weight, resume=args.resume
        )
        if try_get_key(cfg, "train.graph.enabled", default=False):
            model = MoCoPretrainingTrainer.build_graph(cfg, model, is_train=False)
        test_loader = MoCoPretrainingTrainer.build_test_loader(cfg, tokenizer)
        _ = MoCoPretrainingTrainer.test(cfg, test_loader, model)
        return

    trainer = MoCoPretrainingTrainer(cfg)
    return trainer.train()


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    main(args)
projects/MOCOV3/trainer/moco_trainer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
from typing import Callable

from libai.engine.trainer import EagerTrainer


class MoCoEagerTrainer(EagerTrainer):
    def run_step(self, get_batch: Callable):
        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
        start = time.perf_counter()
        # If you want to do something with the data, you can wrap the dataloader.
        data = next(self._data_loader_iter)
        data = get_batch(data, getattr(self.data_loader, "mixup_func", None))
        data_time = time.perf_counter() - start

        # update the moco_momentum per step
        loss_dict, m_dict = self.model(**data, cu_iter=self.iter, m=self.model.m)
        self.model.m = m_dict["m"]
        losses = sum(loss_dict.values()) / self.grad_acc_steps
        losses.backward()
        self.write_metrics(loss_dict, data_time)

        if (self.iter + 1) % self.grad_acc_steps == 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
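run_step divides each micro-batch loss by grad_acc_steps and only steps the optimizer every grad_acc_steps iterations, so gradients accumulate to the same signal as one large batch. A pure-Python illustration of that arithmetic (a sketch, not part of the commit; the numbers are made up):

K = 8  # grad_acc_steps
micro_batch_losses = [2.0, 4.0, 1.0, 3.0, 2.5, 1.5, 3.5, 2.5]  # hypothetical per-step losses

# dividing each loss by K and summing the K backward passes equals the full-batch average
accumulated = sum(loss / K for loss in micro_batch_losses)
full_batch_average = sum(micro_batch_losses) / K
assert abs(accumulated - full_batch_average) < 1e-9
print(accumulated)  # 2.5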
projects/MOCOV3/transform/linear_prob_transform.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from flowvision import transforms
from flowvision.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

from libai.config import LazyCall

train_aug = [
    LazyCall(transforms.RandomResizedCrop)(size=224),
    LazyCall(transforms.RandomHorizontalFlip)(),
    LazyCall(transforms.ToTensor)(),
    LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]
projects/MOCOV3/transform/pretrain_transform.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random

import oneflow as flow
from flowvision import transforms
from flowvision.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from PIL import ImageFilter, ImageOps

from libai.config import LazyCall


class GaussianBlur(object):
    """Gaussian blur augmentation from SimCLR: https://arxiv.org/abs/2002.05709"""

    def __init__(self, sigma=[0.1, 2.0]):
        self.sigma = sigma

    def __call__(self, x):
        sigma = random.uniform(self.sigma[0], self.sigma[1])
        x = x.filter(ImageFilter.GaussianBlur(radius=sigma))
        return x


class Solarize(object):
    """Solarize augmentation from BYOL: https://arxiv.org/abs/2006.07733"""

    def __call__(self, x):
        return ImageOps.solarize(x)


# follow BYOL's augmentation recipe: https://arxiv.org/abs/2006.07733
augmentation1 = [
    LazyCall(transforms.RandomResizedCrop)(size=224, scale=(0.2, 1.0)),
    LazyCall(transforms.RandomApply)(
        transforms=[
            LazyCall(transforms.ColorJitter)(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)
            # not strengthened
        ],
        p=0.8,
    ),
    # TODO: Add RandomGrayscale
    # LazyCall(transforms.RandomGrayscale)(p=0.2),
    LazyCall(transforms.RandomApply)(transforms=[LazyCall(GaussianBlur)(sigma=[0.1, 2.0])], p=1.0),
    LazyCall(transforms.RandomHorizontalFlip)(),
    LazyCall(transforms.ToTensor)(),
    LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]

augmentation2 = [
    LazyCall(transforms.RandomResizedCrop)(size=224, scale=(0.2, 1.0)),
    LazyCall(transforms.RandomApply)(
        transforms=[
            LazyCall(transforms.ColorJitter)(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)
            # not strengthened
        ],
        p=0.8,
    ),
    # TODO: Add RandomGrayscale
    # LazyCall(transforms.RandomGrayscale)(p=0.2),
    LazyCall(transforms.RandomApply)(transforms=[LazyCall(GaussianBlur)(sigma=[0.1, 2.0])], p=1.0),
    LazyCall(transforms.RandomApply)(transforms=[LazyCall(Solarize)()], p=0.2),
    LazyCall(transforms.RandomHorizontalFlip)(),
    LazyCall(transforms.ToTensor)(),
    LazyCall(transforms.Normalize)(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
]


class TwoCropsTransform:
    """Take two random crops of one image"""

    def __init__(self, base_transform1, base_transform2):
        self.base_transform1 = base_transform1
        self.base_transform2 = base_transform2

    def __call__(self, x):
        im1 = self.base_transform1(x)
        im2 = self.base_transform2(x)
        return flow.cat((im1, im2), dim=0)
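A hedged usage sketch (not part of the commit): once the two augmentation lists are materialized into flowvision Compose pipelines, TwoCropsTransform concatenates the two views along the channel dimension, which MoCo.forward later splits back apart with flow.chunk(images, 2, dim=1). The simplified pipelines and the import path below are assumptions for illustration only.

from flowvision import transforms
from PIL import Image

from transform.pretrain_transform import TwoCropsTransform  # assumes "projects/MOCOV3" is on sys.path

# simplified stand-ins for the full augmentation1/augmentation2 recipes
aug1 = transforms.Compose([transforms.RandomResizedCrop(224), transforms.ToTensor()])
aug2 = transforms.Compose([transforms.RandomResizedCrop(224), transforms.ToTensor()])
two_crops = TwoCropsTransform(aug1, aug2)

img = Image.new("RGB", (256, 256))  # dummy image for illustration
views = two_crops(img)
print(views.shape)  # [6, 224, 224]: two 3-channel crops stacked on dim 0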
projects/MOCOV3/utils/load_checkpoint.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

from utils.weight_convert import load_torch_checkpoint_linear_prob

from libai.utils.checkpoint import (
    Checkpointer,
    get_missing_parameters_message,
    get_unexpected_parameters_message,
)

logger = logging.getLogger("libai." + __name__)


def load_checkpoint(model, path, weight_style, num_heads, embed_dim):
    linear_keyword = "head"
    for name, param in model.named_parameters():
        if name not in ["%s.weight" % linear_keyword, "%s.bias" % linear_keyword]:
            param.requires_grad = False

    assert weight_style in ["pytorch", "oneflow"]
    if weight_style == "pytorch":
        params = load_torch_checkpoint_linear_prob(num_heads, embed_dim, path=path)
    else:
        params = Checkpointer(model).load(path)

    model_state_dict = model.state_dict()

    # check the incorrect shape and unexpected keys
    incorrect_shapes = []
    unexpected_keys = []
    for k in list(params.keys()):
        if k in model_state_dict:
            shape_model = tuple(model_state_dict[k].shape)
            shape_ckp = tuple(params[k].shape)
            if shape_model != shape_ckp:
                incorrect_shapes.append((k, shape_ckp, shape_model))
                params.pop(k)
            model_state_dict.pop(k)
        else:
            unexpected_keys.append(k)
    missing_keys = list(model_state_dict.keys())

    for k, shape_checkpoint, shape_model in incorrect_shapes:
        logger.warning(
            "Skip loading parameter '{}' to the model due to incompatible "
            "shapes: {} in the checkpoint but {} in the "
            "model! You might want to double check if this is expected.".format(
                k, shape_checkpoint, shape_model
            )
        )

    if missing_keys:
        logger.info(get_missing_parameters_message(missing_keys))
    if unexpected_keys:
        logger.info(get_unexpected_parameters_message(unexpected_keys))

    model.load_state_dict(params, strict=False)
projects/MOCOV3/utils/weight_convert.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import oneflow as flow
import torch

logger = logging.getLogger(__name__)


def convert_qkv_weight(value, num_heads, hidden_size):
    """
    Convert qkv.weight to be compatible with LiBai transformer layer.

    Args:
        value: qkv.weight in the loaded checkpoint
        num_heads: number of attention heads
        hidden_size: hidden size of the model
    """
    head_size = int(hidden_size / num_heads)
    qkv_weight = (
        value.view(3, num_heads, head_size, hidden_size)
        .permute(1, 0, 2, 3)
        .contiguous()
        .view(hidden_size * 3, hidden_size)
    )
    return qkv_weight


def convert_qkv_bias(value, num_heads, hidden_size):
    """
    Convert qkv.bias to be compatible with LiBai transformer layer.

    Args:
        value: qkv.bias in the loaded checkpoint
        num_heads: number of attention heads
        hidden_size: hidden size of the model
    """
    head_size = int(hidden_size / num_heads)
    qkv_bias = (
        value.view(3, num_heads, head_size).permute(1, 0, 2).contiguous().view(hidden_size * 3)
    )
    return qkv_bias


def filter_keys(key, value, num_heads, hidden_size):
    """Filtering the state_dict keys and values to match LiBai's MOCOV3 model"""
    if "norm1" in key:
        key = key.replace("norm1", "input_layernorm")
    elif "attn.qkv" in key:
        key = key.replace("attn.qkv", "self_attention.query_key_value")
        if "weight" in key:
            value = convert_qkv_weight(value, num_heads, hidden_size)
        if "bias" in key:
            value = convert_qkv_bias(value, num_heads, hidden_size)
    elif "attn.proj" in key:
        key = key.replace("attn.proj", "self_attention.dense")
    elif "norm2" in key:
        key = key.replace("norm2", "post_attention_layernorm")
    elif "mlp.fc1" in key:
        key = key.replace("mlp.fc1", "mlp.dense_h_to_4h")
    elif "mlp.fc2" in key:
        key = key.replace("mlp.fc2", "mlp.dense_4h_to_h")
    elif "fc_norm" in key:
        key = key.replace("fc_norm", "norm")
    return key, value


def load_torch_checkpoint_linear_prob(
    num_heads,
    hidden_size,
    path="projects/MOCOV3/output/vit-b-300ep.pth.tar",
    linear_keyword="head",
):
    """Load checkpoint from the given torch weights.

    Torch weight from: xxx
    """
    torch_dict = torch.load(path, map_location="cpu")["state_dict"]
    parameters = torch_dict
    new_parameters = dict()
    for key, value in parameters.items():
        if "num_batches_tracked" not in key:
            if key.startswith("module.base_encoder") and not key.startswith(
                "module.base_encoder.%s" % linear_keyword
            ):
                # to global tensor
                key, val = filter_keys(key, value, num_heads, hidden_size)
                val = val.detach().cpu().numpy()
                val = flow.tensor(val).to_global(
                    sbp=flow.sbp.broadcast, placement=flow.placement("cuda", {0: range(1)})
                )
                new_parameters[key[len("module.base_encoder.") :]] = val
    return new_parameters
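convert_qkv_weight regroups the fused qkv weight from a (q, k, v)-block layout into LiBai's head-major layout; the shape stays [3 * hidden, hidden] while rows are reordered per head. A small shape check (a sketch, not part of the commit; the import path assumes "projects/MOCOV3" is on sys.path):

import torch

from utils.weight_convert import convert_qkv_weight  # assumed import path for illustration

num_heads, head_size = 2, 3
hidden = num_heads * head_size
qkv = torch.arange(3 * hidden * hidden, dtype=torch.float32).view(3 * hidden, hidden)

converted = convert_qkv_weight(qkv, num_heads, hidden)
print(converted.shape)  # torch.Size([18, 6]) -- same shape, rows regrouped per head
# row blocks are now ordered (q_head0, k_head0, v_head0, q_head1, k_head1, v_head1),
# each block holding head_size rows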
projects/MT5/configs/mt5_base.py (new file, mode 100644)

from omegaconf import DictConfig

from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining

cfg = dict(
    vocab_size=250112,
    hidden_size=768,
    hidden_layers=12,
    num_attention_heads=12,
    head_size=64,
    intermediate_size=2048,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    embedding_dropout_prob=0.1,
    relative_attention_num_buckets=32,
    initializer_range=1.0,
    layernorm_eps=1e-06,
    amp_enabled=False,
    model_type="mt5",
    eos_token_id=1,
    padding_idx=0,
    is_encoder_decoder=True,
    tie_word_embeddings=True,
)

cfg = DictConfig(cfg)

mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
projects/MT5/configs/mt5_large.py (new file, mode 100644)

from omegaconf import DictConfig

from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining

cfg = dict(
    vocab_size=250112,
    hidden_size=1024,
    hidden_layers=24,
    num_attention_heads=16,
    head_size=64,
    intermediate_size=2816,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    embedding_dropout_prob=0.1,
    relative_attention_num_buckets=32,
    initializer_range=1.0,
    layernorm_eps=1e-06,
    amp_enabled=False,
    model_type="mt5",
    eos_token_id=1,
    padding_idx=0,
    is_encoder_decoder=True,
    tie_word_embeddings=False,
)

cfg = DictConfig(cfg)

mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
projects/MT5/configs/mt5_pretrain.py (new file, mode 100644)

from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from libai.scheduler import WarmupExponentialLR

from configs.common.train import train
from configs.common.data.t5_dataset import dataloader, tokenization
from configs.common.models.graph import graph
from configs.common.optim import optim

from projects.MT5.configs.mt5_base import pretrain_model as model

vocab_file = "./data_test/bert_data/bert-base-chinese-vocab.txt"
data_prefix = "./data_test/bert_data/loss_compara_content_sentence"

tokenization.tokenizer.vocab_file = vocab_file
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix

# model config
model.cfg.hidden_size = 768
model.cfg.hidden_layers = 12
model.cfg.num_attention_heads = 12
model.cfg.head_size = 64
model.cfg.intermediate_size = 2048
model.cfg.model_type = "mt5"
model.cfg.hidden_dropout_prob = 0.0
model.cfg.attention_probs_dropout_prob = 0.0
model.cfg.embedding_dropout_prob = 0.0
model.cfg.vocab_size = 30522
model.cfg.padding_idx = 0
model.cfg.tie_word_embeddings = False
model.cfg.is_encoder_decoder = False
model.cfg.amp_enabled = True
model.cfg.initializer_range = 0.02
model.cfg.pretrained_model_path = None

train.update(
    dict(
        output_dir="projects/MT5/output/mt5_output",
        train_micro_batch_size=4,
        train_epoch=1,
        train_iter=24000,
        log_period=10,
        amp=dict(enabled=True),
        warmup_ratio=1 / 24,
        # checkpointer=dict(period=10, max_to_keep=20),
        input_placement_device="cpu",
        dist=dict(
            data_parallel_size=2,
            tensor_parallel_size=2,
            pipeline_parallel_size=1,
            pipeline_num_layers=2 * model.cfg.hidden_layers,
        ),
        scheduler=LazyCall(WarmupExponentialLR)(
            warmup_factor=0.001,
            gamma=1.0,
            warmup_method="linear",
            warmup_iter=0.0,
        ),
        evaluation=dict(
            evaluator=LazyCall(PPLEvaluator)(),
            enabled=True,
            eval_iter=1e5,
            eval_period=5000,
        ),
    )
)

train.zero_optimization.enabled = True
train.zero_optimization.stage = 2
train.activation_checkpoint.enabled = False
train.num_accumulation_steps = 8
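A quick sanity-check sketch for the dist block above (not part of the commit; it assumes, in the usual Megatron/LiBai convention, that the launched world size equals the product of the three parallel sizes, and that pipeline_num_layers counts encoder plus decoder layers for the seq2seq model):

# hypothetical standalone arithmetic, mirroring the values in the config above
data_parallel, tensor_parallel, pipeline_parallel = 2, 2, 1
hidden_layers = 12

world_size = data_parallel * tensor_parallel * pipeline_parallel
pipeline_num_layers = 2 * hidden_layers  # encoder stack + decoder stack
print(world_size, pipeline_num_layers)   # 4 GPUs expected, 24 pipeline-partitioned layers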
projects/MT5/configs/mt5_small.py (new file, mode 100644)

from omegaconf import DictConfig

from libai.config import LazyCall
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining

cfg = dict(
    vocab_size=250112,
    hidden_size=512,
    hidden_layers=8,
    num_attention_heads=6,
    head_size=64,
    intermediate_size=1024,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    embedding_dropout_prob=0.1,
    relative_attention_num_buckets=32,
    initializer_range=1.0,
    layernorm_eps=1e-06,
    amp_enabled=False,
    model_type="mt5",
    eos_token_id=1,
    padding_idx=0,
    is_encoder_decoder=True,
    tie_word_embeddings=False,
)

cfg = DictConfig(cfg)

mt5_model = LazyCall(MT5Model)(cfg=cfg)
pretrain_model = LazyCall(MT5ForPreTraining)(cfg=cfg)
projects/MT5/configs/t5_inference.py (new file, mode 100644)

from .mt5_base import cfg
from libai.config import LazyCall
from libai.tokenizer import T5Tokenizer
from projects.MT5.mt5_model import MT5Model, MT5ForPreTraining
from configs.common.train import train
from configs.common.data.t5_dataset import tokenization

cfg.update(
    model_type="t5",
    is_encoder_decoder=True,
    max_length=20,
    min_length=0,
    do_sample=False,
    early_stopping=False,
    num_beams=1,
    num_beam_groups=1,
    diversity_penalty=0.0,
    temperature=1.0,
    top_k=50,
    top_p=1.0,
    typical_p=1.0,
    repetition_penalty=1.0,
    length_penalty=1.0,
    no_repeat_ngram_size=0,
    encoder_no_repeat_ngram_size=0,
    num_return_sequences=1,
    chunk_size_feed_forward=0,
    output_scores=False,
    forced_bos_token_id=None,
    forced_eos_token_id=None,
    remove_invalid_values=False,
    exponential_decay_length_penalty=None,
    use_cache=True,
    # Tokenizer
    pad_token_id=0,
    eos_token_id=1,
    bos_token_id=None,
    sep_token_id=None,
    decoder_start_token_id=0,
)

model = LazyCall(MT5Model)(cfg=cfg)

tokenization.tokenizer = LazyCall(T5Tokenizer)(
    vocab_file="/path/to/spiece.model",
    add_bos_token=True,
)
projects/MT5/layers/attention_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import Tuple

import oneflow as flow
from oneflow import nn

from libai.layers.linear import Linear
from libai.utils import distributed as dist
from projects.MT5.layers.embed_layer import Embedding


class MultiheadAttention(nn.Module):
    """Multi-head attention layer, support self attention and cross attention.

    Args:
        hidden_size: size of hidden state.
        num_attention_heads: number of attention heads.
        is_cross_attention: used to specify whether it is self attention or cross attention.
            Defaults to False.
        attention_dropout_prob: dropout probability of attention weights.
            Defaults to 0.0.
        output_dropout_prob: dropout probability of output. Defaults to 0.0.
        init_method: method to initialize the input layer weights.
            Defaults to ``init.xavier_normal_``.
        output_layer_init_method: method to initialize the output layer weights.
            If None, use ``init_method``.
        layer_idx: a layer_idx sign which determines the placements.
            It will be used in pipeline parallelism. Defaults to 0.
    """

    def __init__(
        self,
        hidden_size,
        num_attention_heads,
        head_size,
        relative_attention_num_buckets,
        is_cross_attention=False,
        attention_dropout_prob=0.0,
        output_dropout_prob=0.0,
        init_method=nn.init.xavier_normal_,
        output_layer_init_method=None,
        padding_idx=None,
        *,
        layer_idx=0,
        has_relative_attention_bias=False,
        is_decoder=False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.has_relative_attention_bias = has_relative_attention_bias
        self.is_decoder = is_decoder
        self.attention_dropout_prob = attention_dropout_prob
        if output_layer_init_method is None:
            output_layer_init_method = init_method

        self.num_heads = num_attention_heads
        self.head_size = head_size

        self.dropout = nn.Dropout(p=attention_dropout_prob)
        self.norm_factor = 1.0 / math.sqrt(float(self.head_size))

        self.is_cross_attention = is_cross_attention

        self.output_dropout = nn.Dropout(p=output_dropout_prob)

        if self.is_cross_attention:
            self.query = Linear(
                self.hidden_size,
                self.num_heads * self.head_size,
                bias=False,
                parallel="col",
                init_method=init_method,
                layer_idx=layer_idx,
            )
            self.key_value = Linear(
                self.hidden_size,
                self.num_heads * self.head_size * 2,
                bias=False,
                parallel="col",
                init_method=init_method,
                layer_idx=layer_idx,
            )
        else:
            self.query_key_value = Linear(
                self.hidden_size,
                self.num_heads * self.head_size * 3,
                bias=False,
                parallel="col",
                init_method=init_method,
                layer_idx=layer_idx,
            )

        self.dense = Linear(
            self.num_heads * self.head_size,
            self.hidden_size,
            bias=False,
            parallel="row",
            init_method=output_layer_init_method,
            skip_bias_add=False,
            layer_idx=layer_idx,
        )

        if self.has_relative_attention_bias:
            self.relative_attention_bias = Embedding(
                self.relative_attention_num_buckets,
                self.num_heads,
                padding_idx=padding_idx,
                layer_idx=layer_idx,
            )

    def forward(
        self,
        hidden_states: flow.Tensor,
        encoder_states: flow.Tensor = None,
        attention_mask: flow.Tensor = None,
        past_key_value: Tuple[flow.Tensor, flow.Tensor] = None,
        use_cache: bool = False,
        position_bias=None,
        query_length=None,
    ):
        """
        Args:
            hidden_states (flow.Tensor): shape is [bsz, tgt_len, hidden_size].
            encoder_states (flow.Tensor, optional): shape is [bsz, src_len, hidden_size].
                Defaults to None.
            attention_mask (flow.Tensor, optional): shape is [bsz, 1, tgt_len, src_len].
                It should be the combination of padding mask and causal mask.
                It is the padding mask of source input when used with self-attention in encoder.
                And it is the combination of padding mask of target input and causal mask when
                used with self-attention in decoder. It is the padding mask of source input when
                used with cross-attention in decoder.
                Defaults to None.
            past_key_value (Tuple[flow.Tensor, flow.Tensor], optional): tuple of key and value,
                each shape is [bsz, num_heads, src_len, head_size]. Defaults to None.
            use_cache (bool, optional): it will be set to True, when the model is in the inference
                phase and used for incremental decoding. Defaults to False.
        """
        if encoder_states is not None:
            encoder_states = encoder_states.to_global(placement=hidden_states.placement)

        if attention_mask is not None:
            attention_mask = attention_mask.to_global(placement=hidden_states.placement)

        # hidden_states shape: [seq_len, batch_size, hidden_size]
        real_seq_length, bsz = hidden_states.size()[:2]

        if past_key_value is not None:
            assert (
                len(past_key_value) == 2
            ), "past_key_value should have 2 past states: keys and values."
            f"Got {len(past_key_value)} past states.\n"
            real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length

        key_length = real_seq_length if encoder_states is None else encoder_states.shape[0]

        if self.is_cross_attention:
            query = self.query(hidden_states)
            query = query.view(-1, bsz, self.num_heads, self.head_size)
            query = query.permute(1, 2, 0, 3)  # bsz, num_head, seq_len, head_size
            if past_key_value is not None:
                key, value = past_key_value
            elif encoder_states is not None:
                key_value = self.key_value(encoder_states)
                key_value = key_value.view(-1, bsz, self.num_heads, 2 * self.head_size)
                key_value = key_value.permute(1, 2, 0, 3)
                key, value = flow.chunk(key_value, chunks=2, dim=-1)
            else:
                raise ValueError(
                    "past_key_value and encoder_states cannot be None at the same time."
                )
        else:
            query_key_value = self.query_key_value(hidden_states)
            if use_cache:
                query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size)
                query_key_value = query_key_value.permute(
                    0, 2, 1, 3
                )  # [bsz, num_heads, src_len, 3 * head_size]
                query, key, value = flow.chunk(query_key_value, chunks=3, dim=-1)
            else:
                attention_scores, value = flow._C.fused_self_attention(
                    query_key_value, head_size=self.head_size, alpha=1
                )
            if past_key_value is not None:
                past_key, past_value = past_key_value
                key = flow.cat((past_key.type_as(key), key), dim=2)
                value = flow.cat((past_value.type_as(value), value), dim=2)

        if use_cache:
            past_key_value = (key, value)

        if self.is_cross_attention or use_cache:
            attention_scores = flow.matmul(query, key, transpose_b=True, alpha=1)

        if position_bias is None:
            if not self.has_relative_attention_bias:
                position_bias = flow.zeros(
                    (1, self.num_heads, real_seq_length, key_length),
                    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
                    placement=attention_scores.placement,
                )
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, placement=attention_mask.placement
                )

            if past_key_value is not None:
                position_bias = position_bias[:, :, -hidden_states.size(1) :, :]

        if attention_mask is not None:
            if use_cache:
                attention_mask = attention_mask.expand_as(attention_scores)

            attention_weights = flow._C.fused_bias_add_scale_mask_softmax_dropout(
                attention_scores,
                position_bias,
                attention_mask,
                fill_value=-10000.0,
                scale=1,
                p=self.attention_dropout_prob,
            )[0]
        else:
            attention_scores = attention_scores + position_bias
            attention_weights = flow.softmax(attention_scores, dim=-1)
            attention_weights = self.dropout(attention_weights)

        context = flow.matmul(attention_weights, value)

        """ transpose [batch_size, num_head, seq_len, head_size] to
            [seq_len, batch_size, num_head, head_size]
        """
        context = flow._C.transpose(context, perm=(2, 0, 1, 3))

        output = self.dense(context.flatten(2))
        output = self.output_dropout(output)

        if use_cache:
            output = (output, past_key_value)

        output = (output,) + (position_bias,)

        return output

    def extra_repr(self) -> str:
        return "hidden_size={}, num_heads={}, is_cross_attention={}".format(
            self.hidden_size,
            self.num_heads,
            self.is_cross_attention,
        )

    def _relative_position_bucket(
        self, relative_position, bidirectional=True, num_buckets=32, max_distance=128
    ):
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets = (
                relative_buckets + (relative_position > 0).to(flow.long) * num_buckets
            )
            relative_position = flow.abs(relative_position)
        else:
            relative_position = (
                -1
                * flow.min(
                    relative_position,
                    flow.zeros(
                        relative_position.size(),
                        sbp=relative_position.sbp,
                        placement=relative_position.placement,
                    ),
                ).to(flow.long)
            )
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        relative_postion_if_large = max_exact + (
            flow.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(flow.long)
        relative_postion_if_large = flow.min(
            relative_postion_if_large,
            flow.zeros(
                relative_postion_if_large.size(),
                dtype=relative_postion_if_large.dtype,
                sbp=relative_postion_if_large.sbp,
                placement=relative_postion_if_large.placement,
            ).fill_(num_buckets - 1),
        )

        relative_buckets = relative_buckets + flow.where(
            is_small, relative_position, relative_postion_if_large
        )
        return relative_buckets

    def compute_bias(self, query_length, key_length, placement=None):
        """Compute binned relative position bias"""
        context_position = flow.arange(
            query_length,
            dtype=flow.long,
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            placement=placement,
        )
        memory_position = flow.arange(
            key_length,
            dtype=flow.long,
            sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            placement=placement,
        )
        relative_position = memory_position[None, :] - context_position[:, None]
        # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
        )
        # shape (query_length, key_length)
        values = self.relative_attention_bias(relative_position_bucket)
        # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)
        # shape (1, num_heads, query_length, key_length)
        return values
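For readers unfamiliar with T5-style relative attention, _relative_position_bucket maps a signed token offset to one of num_buckets ids: small offsets get exact buckets, larger ones get logarithmically spaced buckets up to max_distance, and in the bidirectional case the sign is encoded in the upper half of the bucket range. A plain-Python restatement for a single offset (a sketch for illustration, not part of the commit):

import math

def relative_position_bucket(rel_pos, bidirectional=True, num_buckets=32, max_distance=128):
    # scalar restatement of MultiheadAttention._relative_position_bucket above
    bucket = 0
    if bidirectional:
        num_buckets //= 2
        bucket += num_buckets if rel_pos > 0 else 0  # direction goes in the upper half
        rel_pos = abs(rel_pos)
    else:
        rel_pos = max(-rel_pos, 0)
    max_exact = num_buckets // 2
    if rel_pos < max_exact:
        return bucket + rel_pos  # exact buckets for short distances
    # logarithmic buckets for larger distances, capped at num_buckets - 1
    large = max_exact + int(
        math.log(rel_pos / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
    )
    return bucket + min(large, num_buckets - 1)

print([relative_position_bucket(d) for d in (-64, -2, 0, 1, 2, 8, 64)])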
projects/MT5/layers/embed_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import oneflow as flow
import oneflow.nn as nn
from oneflow.nn import init

import libai.utils.distributed as dist
from libai.layers.embedding import VocabEmbedding


class MT5Embedding(flow.nn.Module):
    def __init__(
        self,
        hidden_size,
        vocab_size,
        embedding_dropout_prob,
        pad_token_id=0,
        init_method=flow.nn.init.xavier_normal_,
        amp_enabled=False,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.word_embeddings = VocabEmbedding(
            num_embeddings=vocab_size,
            embedding_dim=hidden_size,
            init_method=init_method,
            amp_enabled=amp_enabled,
            padding_idx=pad_token_id,
        )
        self.embedding_dropout = flow.nn.Dropout(embedding_dropout_prob)

    def forward(self, input_ids):
        word_embeddings = self.word_embeddings(input_ids)
        embeddings = self.embedding_dropout(word_embeddings)
        return embeddings


class Embedding(nn.Module):
    """Construct the trainable embedding module, which does not support parallelization.
    This can be used for positional embedding and token type embedding.

    Arguments:
        num_embeddings: size of vocabulary.
        embedding_dim: dimension of embeddings.
        padding_idx: pad index. Defaults to None.
        init_method: method to initialize weights. Defaults to ``flow.nn.init.xavier_normal_``.
        amp_enabled: fp16 option for embedding weight. Defaults to False.
    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        padding_idx=None,
        init_method=init.xavier_normal_,
        amp_enabled=False,
        layer_idx=0,
    ):
        super().__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        if padding_idx is not None:
            if padding_idx > 0:
                assert (
                    padding_idx < self.num_embeddings
                ), "Padding_idx must be within num_embeddings"
            elif padding_idx < 0:
                assert (
                    padding_idx >= -self.num_embeddings
                ), "Padding_idx must be within num_embeddings"
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx
        self.init_method = init_method
        self.amp_enabled = amp_enabled

        assert num_embeddings > 0
        self.weight = nn.Parameter(
            flow.empty(
                (num_embeddings, embedding_dim),
                dtype=flow.float32,
                placement=dist.get_layer_placement(layer_idx),
                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
            )
        )
        self.init_method(self.weight)

    def forward(self, input_ids):
        weight = flow._C.amp_white_identity(self.weight) if self.amp_enabled else self.weight
        input_embeds = flow._C.gather(weight, input_ids, axis=0)
        return input_embeds

    def _fill_padding_idx_with_zero(self) -> None:
        if self.padding_idx is not None:
            with flow.no_grad():
                self.weight[self.padding_idx] = flow.zeros(
                    self.embedding_dim,
                    placement=dist.get_layer_placement(0),
                    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
                )

    def extra_repr(self) -> str:
        s = "num_embeddings={num_embeddings}, embedding_dim={embedding_dim}"
        if self.padding_idx is not None:
            s += ", padding_idx={padding_idx}"
        return s.format(**self.__dict__)
projects/MT5/layers/lm_head_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from oneflow import nn

from libai.layers import Linear, LMLogits


class LMHead(nn.Module):
    def __init__(self, model_type, hidden_size, vocab_size, hidden_layers):
        super().__init__()
        if model_type == "mt5":
            self.lm_head = Linear(
                hidden_size, vocab_size, bias=False, layer_idx=2 * hidden_layers - 1
            )
        else:
            self.lm_head = LMLogits(vocab_size, bias=True)

    def forward(self, decoder_states, embed_weight=None):
        if isinstance(self.lm_head, Linear):
            logits = self.lm_head(decoder_states)
        else:
            logits = self.lm_head(decoder_states, embed_weight)
        return logits
projects/MT5/layers/logits_layer.py (new file, mode 100644)

# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import oneflow as flow
from oneflow import nn

from libai.layers import Linear
from libai.utils import distributed as dist


class LMLogits(nn.Module):
    def __init__(self, vocab_size, hidden_size=None, bias=False, model_type="t5", layer_idx=-1):
        super().__init__()
        self.model_type = model_type
        if model_type == "t5":
            self.bias = (
                nn.Parameter(
                    flow.zeros(
                        (vocab_size,),
                        dtype=flow.float32,
                        placement=dist.get_layer_placement(layer_idx),
                        sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.split(0)]),
                    )
                )
                if bias
                else None
            )
        elif model_type == "mt5":
            self.linear = Linear(hidden_size, vocab_size, bias=False, layer_idx=layer_idx)

    def forward(self, input, word_embeddings=None):
        if self.model_type == "t5":
            w = word_embeddings.to_global(placement=input.placement)
            input = input.to_global(grad_sbp=input.sbp)
            logits = flow._C.matmul(input, w, transpose_b=True)
            if self.bias is not None:
                logits = logits + self.bias
        else:
            logits = self.linear(input)
        return logits