dcuai / dlexamples · Commits

Commit 316d3f90
Authored Jul 14, 2022 by Pan,Huiwen
Parent: aebde649

    Add DeepSpeed framework test models (增加ds框架测试模型)

227 changed files with 10153 additions and 1 deletion (20 files shown below).
Changed files:

    DeepSpeed                                                            +0 / -1
    Deepspeed/.pre-commit-config.yaml                                   +18 / -0
    Deepspeed/BingBertGlue/glue_bert_base.json                          +19 / -0
    Deepspeed/BingBertGlue/glue_bert_large.json                         +19 / -0
    Deepspeed/BingBertGlue/nvidia/modeling.py                         +1459 / -0
    Deepspeed/BingBertGlue/nvidia/modelingpreln.py                    +1626 / -0
    Deepspeed/BingBertGlue/nvidia/modelingpreln_layerdrop.py          +1662 / -0
    Deepspeed/BingBertGlue/nvidia_bert_dataset_provider.py             +169 / -0
    Deepspeed/BingBertGlue/pytorch_pretrained_bert/__init__.py           +8 / -0
    Deepspeed/BingBertGlue/pytorch_pretrained_bert/__main__.py          +28 / -0
    Deepspeed/BingBertGlue/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py   +117 / -0
    Deepspeed/BingBertGlue/pytorch_pretrained_bert/file_utils.py        +246 / -0
    Deepspeed/BingBertGlue/pytorch_pretrained_bert/modeling.py         +1254 / -0
    Deepspeed/BingBertGlue/pytorch_pretrained_bert/optimization.py      +237 / -0
    Deepspeed/BingBertGlue/pytorch_pretrained_bert/tokenization.py      +386 / -0
    Deepspeed/BingBertGlue/run_glue_bert_base_finetune.sh                +55 / -0
    Deepspeed/BingBertGlue/run_glue_bert_large_finetune.sh               +55 / -0
    Deepspeed/BingBertGlue/run_glue_classifier_bert_base.py            +1145 / -0
    Deepspeed/BingBertGlue/run_glue_classifier_bert_large.py           +1260 / -0
    Deepspeed/BingBertGlue/turing/dataset.py                            +390 / -0
DeepSpeed @ 6bd444a7 (submodule entry removed: +0 / -1)
    Subproject commit 6bd444a7c62e9d7d320dd4c1e1142062f50c861d
Deepspeed/.pre-commit-config.yaml (new file, mode 100644)
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v1.2.3
    hooks:
    -   id: trailing-whitespace
        exclude: "Megatron-LM/"
    -   id: check-yaml
        exclude: "Megatron-LM/"
    -   id: end-of-file-fixer
        exclude: "Megatron-LM/"

-   repo: https://github.com/pre-commit/mirrors-yapf
    rev: v0.29.0
    hooks:
    -   id: yapf
        exclude: "Megatron-LM/"
Deepspeed/BingBertGlue/glue_bert_base.json (new file, mode 100644)
{
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 1,
  "steps_per_print": 10,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 2e-5,
      "weight_decay": 0.0,
      "bias_correction": true
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": false
  }
}
\ No newline at end of file
Deepspeed/BingBertGlue/glue_bert_large.json (new file, mode 100644)
{
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 1,
  "steps_per_print": 10,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 2e-5,
      "weight_decay": 0.0,
      "bias_correction": true
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": false
  }
}
\ No newline at end of file
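Note on usage (not part of the diff): the two GLUE JSON files above are standard DeepSpeed configuration files. The sketch below shows, under common assumptions, how such a config usually reaches the engine: the launcher's --deepspeed_config flag points at the JSON and deepspeed.initialize builds the training engine from it. The model here is a placeholder; the commit's own entry points are the run_glue_* scripts listed above.

# Illustrative sketch only -- not part of this commit. The placeholder Linear
# model stands in for the BERT GLUE classifier wrapped by the real scripts.
import argparse

import deepspeed
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=-1)
# Adds the standard --deepspeed / --deepspeed_config flags to the parser.
parser = deepspeed.add_config_arguments(parser)
args = parser.parse_args()

model = torch.nn.Linear(768, 2)  # placeholder network

# train_batch_size, optimizer, fp16, gradient_clipping, etc. are read from the
# JSON file passed via --deepspeed_config (e.g. glue_bert_base.json).
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args, model=model, model_parameters=model.parameters())

A typical launch would then look like `deepspeed train.py --deepspeed --deepspeed_config glue_bert_base.json`, which is presumably the pattern the run_glue_*_finetune.sh scripts in this commit follow.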
Deepspeed/BingBertGlue/nvidia/modeling.py (new file, mode 100644)
# DeepSpeed note, code taken from commit 3d59216cec89a363649b4fe3d15295ba936ced0f
# https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals

import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils import checkpoint

import torch.distributed as dist

from turing.file_utils import cached_path

from torch.nn import Module
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.nn.init as init

logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
    'bert-large-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
    'bert-base-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
    'bert-large-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
    'bert-base-multilingual-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
    'bert-base-multilingual-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
    'bert-base-chinese':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
CONFIG_NAME = 'bert_config.json'
WEIGHTS_NAME = 'pytorch_model.bin'
TF_WEIGHTS_NAME = 'model.ckpt'
def load_tf_weights_in_bert(model, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        print(
            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
@torch.jit.script
def f_gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))


@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))


@torch.jit.script
def bias_tanh(bias, y):
    x = bias + y
    return torch.tanh(x)


def gelu(x):
    """Implementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return f_gelu(x)


def swish(x):
    return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
class LinearActivation(Module):
    r"""Fused Linear and activation Module.
    """
    __constants__ = ['bias']

    def __init__(self, in_features, out_features, act='gelu', bias=True):
        super(LinearActivation, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.fused_gelu = False
        self.fused_tanh = False
        if isinstance(act, str) or (sys.version_info[0] == 2
                                    and isinstance(act, unicode)):
            if bias and act == 'gelu':
                self.fused_gelu = True
            elif bias and act == 'tanh':
                self.fused_tanh = True
            else:
                self.act_fn = ACT2FN[act]
        else:
            self.act_fn = act
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        if self.fused_gelu:
            return bias_gelu(self.bias, F.linear(input, self.weight, None))
        elif self.fused_tanh:
            return bias_tanh(self.bias, F.linear(input, self.weight, None))
        else:
            return self.act_fn(F.linear(input, self.weight, self.bias))

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None)
class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.
    """
    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02):
        """Constructs BertConfig.

        Args:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probabilitiy for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The sttdev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        if isinstance(vocab_size_or_config_json_file, str) or (
                sys.version_info[0] == 2
                and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r",
                      encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)")

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
try:
    import apex
    #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
    import apex.normalization
    #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
    BertLayerNorm = apex.normalization.FusedLayerNorm
except ImportError:
    print(
        "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex."
    )

    class BertLayerNorm(nn.Module):
        def __init__(self, hidden_size, eps=1e-12):
            """Construct a layernorm module in the TF style (epsilon inside the square root).
            """
            super(BertLayerNorm, self).__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.bias = nn.Parameter(torch.zeros(hidden_size))
            self.variance_epsilon = eps

        def forward(self, x):
            u = x.mean(-1, keepdim=True)
            s = (x - u).pow(2).mean(-1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
            return self.weight * x + self.bias
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length,
                                    dtype=torch.long,
                                    device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" %
                (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size /
                                       config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.softmax = nn.Softmax(dim=-1)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
                                       self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def transpose_key_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
                                       self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 3, 1)

    def forward(self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_key_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer)
        attention_scores = attention_scores / math.sqrt(
            self.attention_head_size)
        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = self.softmax(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.all_head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
class BertAttention(nn.Module):
    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output
class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size,
                                          config.intermediate_size,
                                          act=config.hidden_act)

    def forward(self, hidden_states):
        hidden_states = self.dense_act(hidden_states)
        return hidden_states
class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        attention_output = self.attention(hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
class BertEncoder(nn.Module):
    def __init__(self, config, args):
        super(BertEncoder, self).__init__()

        if args.deepspeed_transformer_kernel:
            from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig, DeepSpeedConfig

            if hasattr(args, 'deepspeed_config') and args.deepspeed_config:
                ds_config = DeepSpeedConfig(args.deepspeed_config)
            else:
                raise RuntimeError('deepspeed_config is not found in args.')

            cuda_config = DeepSpeedTransformerConfig(
                batch_size=ds_config.train_micro_batch_size_per_gpu,
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                heads=config.num_attention_heads,
                attn_dropout_ratio=config.attention_probs_dropout_prob,
                hidden_dropout_ratio=config.hidden_dropout_prob,
                num_hidden_layers=config.num_hidden_layers,
                initializer_range=config.initializer_range,
                local_rank=args.local_rank,
                seed=args.seed,
                fp16=ds_config.fp16_enabled,
                pre_layer_norm=False)

            self.layer = nn.ModuleList([
                copy.deepcopy(DeepSpeedTransformerLayer(cuda_config))
                for _ in range(config.num_hidden_layers)
            ])
        else:
            layer = BertLayer(config)
            self.layer = nn.ModuleList([
                copy.deepcopy(layer) for _ in range(config.num_hidden_layers)
            ])

    # def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
    #     all_encoder_layers = []
    #     for layer_module in self.layer:
    #         hidden_states = layer_module(hidden_states, attention_mask)
    #         if output_all_encoded_layers:
    #             all_encoder_layers.append(hidden_states)
    #     if not output_all_encoded_layers:
    #         all_encoder_layers.append(hidden_states)
    #     return all_encoder_layers

    def forward(self,
                hidden_states,
                attention_mask,
                output_all_encoded_layers=True,
                checkpoint_activations=False):
        all_encoder_layers = []

        def custom(start, end):
            def custom_forward(*inputs):
                layers = self.layer[start:end]
                x_ = inputs[0]
                for layer in layers:
                    x_ = layer(x_, inputs[1])
                return x_

            return custom_forward

        if checkpoint_activations:
            l = 0
            num_layers = len(self.layer)
            chunk_length = math.ceil(math.sqrt(num_layers))
            while l < num_layers:
                hidden_states = checkpoint.checkpoint(
                    custom(l, l + chunk_length), hidden_states,
                    attention_mask * 1)
                l += chunk_length
            # decoder layers
        else:
            for i, layer_module in enumerate(self.layer):
                hidden_states = layer_module(hidden_states, attention_mask)

                if output_all_encoded_layers:
                    all_encoder_layers.append(hidden_states)

        if not output_all_encoded_layers or checkpoint_activations:
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers


#class BertEncoder(nn.Module):
#    def __init__(self, config):
#        super(BertEncoder, self).__init__()
#        layer = BertLayer(config)
#        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
#
#    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
#        all_encoder_layers = []
#        for layer_module in self.layer:
#            hidden_states = layer_module(hidden_states, attention_mask)
#            if output_all_encoded_layers:
#                all_encoder_layers.append(hidden_states)
#        if not output_all_encoded_layers:
#            all_encoder_layers.append(hidden_states)
#        return all_encoder_layers
class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size,
                                          config.hidden_size,
                                          act="tanh")

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense_act(first_token_tensor)
        return pooled_output
class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super(BertPredictionHeadTransform, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size,
                                          config.hidden_size,
                                          act=config.hidden_act)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden_states):
        hidden_states = self.dense_act(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states
class BertLMPredictionHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                 bert_model_embedding_weights.size(0),
                                 bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(
            torch.zeros(bert_model_embedding_weights.size(0)))

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        torch.cuda.nvtx.range_push(
            "decoder input.size() = {}, weight.size() = {}".format(
                hidden_states.size(), self.decoder.weight.size()))
        hidden_states = self.decoder(hidden_states) + self.bias
        torch.cuda.nvtx.range_pop()
        return hidden_states
class BertOnlyMLMHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertOnlyMLMHead, self).__init__()
        self.predictions = BertLMPredictionHead(config,
                                                bert_model_embedding_weights)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores
class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super(BertOnlyNSPHead, self).__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score
class BertPreTrainingHeads(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertPreTrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(config,
                                                bert_model_embedding_weights)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
    """ An abstract class to handle weights initialization and
        a simple interface for dowloading and loading pretrained models.
    """
    def __init__(self, config, *inputs, **kwargs):
        super(BertPreTrainedModel, self).__init__()
        if not isinstance(config, BertConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__))
        self.config = config

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0,
                                       std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        state_dict=None,
                        cache_dir=None,
                        from_tf=False,
                        *inputs,
                        **kwargs):
        """
        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.

        Params:
            pretrained_model_name_or_path: either:
                - a str with the name of a pre-trained model to load selected in the list of:
                    . `bert-base-uncased`
                    . `bert-large-uncased`
                    . `bert-base-cased`
                    . `bert-large-cased`
                    . `bert-base-multilingual-uncased`
                    . `bert-base-multilingual-cased`
                    . `bert-base-chinese`
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `model.chkpt` a TensorFlow checkpoint
            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
            *inputs, **kwargs: additional input for the specific Bert class
                (ex: num_labels for BertForSequenceClassification)
        """
        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[
                pretrained_model_name_or_path]
        else:
            archive_file = pretrained_model_name_or_path
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(archive_file,
                                                cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
                    archive_file))
            return None
        if resolved_archive_file == archive_file:
            logger.info("loading archive file {}".format(archive_file))
        else:
            logger.info("loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file))
        tempdir = None
        if os.path.isdir(resolved_archive_file) or from_tf:
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info("extracting archive file {} to temp dir {}".format(
                resolved_archive_file, tempdir))
            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
                archive.extractall(tempdir)
            serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None and not from_tf:
            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
            state_dict = torch.load(
                weights_path,
                map_location='cpu' if not torch.cuda.is_available() else None)
        if tempdir:
            # Clean up temp dir
            shutil.rmtree(tempdir)
        if from_tf:
            # Directly load from a TensorFlow checkpoint
            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
            return load_tf_weights_in_bert(model, weights_path)
        # Load from a PyTorch state_dict
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})
            module._load_from_state_dict(state_dict, prefix, local_metadata,
                                         True, missing_keys, unexpected_keys,
                                         error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        start_prefix = ''
        if not hasattr(model, 'bert') and any(
                s.startswith('bert.') for s in state_dict.keys()):
            start_prefix = 'bert.'
        load(model, prefix=start_prefix)
        if len(missing_keys) > 0:
            logger.info(
                "Weights of {} not initialized from pretrained model: {}".format(
                    model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys))
        if len(error_msgs) > 0:
            raise RuntimeError(
                'Error(s) in loading state_dict for {}:\n\t{}'.format(
                    model.__class__.__name__, "\n\t".join(error_msgs)))
        return model
class BertModel(BertPreTrainedModel):
    """BERT model ("Bidirectional Embedding Representations from a Transformer").

    Params:
        config: a BertConfig class instance with the configuration to build a new model

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.

    Outputs: Tuple of (encoded_layers, pooled_output)
        `encoded_layers`: controled by `output_all_encoded_layers` argument:
            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
                to the last attention block of shape [batch_size, sequence_length, hidden_size],
        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
            classifier pretrained on top of the hidden state associated to the first character of the
            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = modeling.BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config, args):
        super(BertModel, self).__init__(config)
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config, args)
        self.pooler = BertPooler(config)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                output_all_encoded_layers=True,
                checkpoint_activations=False):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoded_layers = self.encoder(
            embedding_output,
            extended_attention_mask,
            output_all_encoded_layers=output_all_encoded_layers,
            checkpoint_activations=checkpoint_activations)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)
        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        return encoded_layers, pooled_output
class BertForPreTraining(BertPreTrainedModel):
    """BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads:
        - the masked language modeling head, and
        - the next sentence classification head.

    Params:
        config: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
            is only computed for the labels set in [0, ..., vocab_size]
        `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
            with indices selected in [0, 1].
            0 => next sentence is the continuation, 1 => next sentence is a random sentence.

    Outputs:
        if `masked_lm_labels` and `next_sentence_label` are not `None`:
            Outputs the total_loss which is the sum of the masked language modeling loss and the next
            sentence classification loss.
        if `masked_lm_labels` or `next_sentence_label` is `None`:
            Outputs a tuple comprising
            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
            - the next sentence classification logits of shape [batch_size, 2].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = BertForPreTraining(config)
    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config, args):
        super(BertForPreTraining, self).__init__(config)
        self.summary_writer = None
        if dist.get_rank() == 0:
            self.summary_writer = args.summary_writer
        self.samples_per_step = dist.get_world_size() * args.train_batch_size
        self.sample_count = self.samples_per_step
        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(
            config, self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)

    def log_summary_writer(self, logs: dict, base='Train'):
        if dist.get_rank() == 0:
            module_name = "Samples"  #self._batch_module_name.get(batch_type, self._get_batch_type_error(batch_type))
            for key, log in logs.items():
                self.summary_writer.add_scalar(f'{base}/{module_name}/{key}',
                                               log, self.sample_count)
            self.sample_count += self.samples_per_step

    def forward(self, batch, log=True):
        #input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None, checkpoint_activations=False):
        input_ids = batch[1]
        token_type_ids = batch[3]
        attention_mask = batch[2]
        masked_lm_labels = batch[5]
        next_sentence_label = batch[4]
        checkpoint_activations = False

        sequence_output, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False,
            checkpoint_activations=checkpoint_activations)
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output)

        if masked_lm_labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size),
                masked_lm_labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss))
            total_loss = masked_lm_loss + next_sentence_loss
            # if log:
            #     self.log_summary_writer(logs={'train_loss': total_loss.item()})
            return total_loss
        else:
            return prediction_scores, seq_relationship_score
class BertForMaskedLM(BertPreTrainedModel):
    """BERT model with the masked language modeling head.
    This module comprises the BERT model followed by the masked language modeling head.

    Params:
        config: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
            is only computed for the labels set in [0, ..., vocab_size]

    Outputs:
        if `masked_lm_labels` is not `None`:
            Outputs the masked language modeling loss.
        if `masked_lm_labels` is `None`:
            Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = BertForMaskedLM(config)
    masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config):
        super(BertForMaskedLM, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config,
                                   self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                masked_lm_labels=None,
                checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        prediction_scores = self.cls(sequence_output)

        if masked_lm_labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size),
                masked_lm_labels.view(-1))
            return masked_lm_loss
        else:
            return prediction_scores
class BertForNextSentencePrediction(BertPreTrainedModel):
    """BERT model with next sentence prediction head.
    This module comprises the BERT model followed by the next sentence classification head.

    Params:
        config: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
            with indices selected in [0, 1].
            0 => next sentence is the continuation, 1 => next sentence is a random sentence.

    Outputs:
        if `next_sentence_label` is not `None`:
            Outputs the total_loss which is the sum of the masked language modeling loss and the next
            sentence classification loss.
        if `next_sentence_label` is `None`:
            Outputs the next sentence classification logits of shape [batch_size, 2].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = BertForNextSentencePrediction(config)
    seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config):
        super(BertForNextSentencePrediction, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                next_sentence_label=None,
                checkpoint_activations=False):
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids,
                                     attention_mask,
                                     output_all_encoded_layers=False)
        seq_relationship_score = self.cls(pooled_output)

        if next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            return next_sentence_loss
        else:
            return seq_relationship_score
class BertForSequenceClassification(BertPreTrainedModel):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_labels`: the number of classes for the classifier. Default = 2.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
            with indices selected in [0, ..., num_labels].

    Outputs:
        if `labels` is not `None`:
            Outputs the CrossEntropy classification loss of the output with the labels.
        if `labels` is `None`:
            Outputs the classification logits of shape [batch_size, num_labels].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    num_labels = 2

    model = BertForSequenceClassification(config, num_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, args, config, num_labels):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config, args)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids,
                                     attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits
class BertForMultipleChoice(BertPreTrainedModel):
    """BERT model for multiple choice tasks.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_choices`: the number of classes for the classifier. Default = 2.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
            with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
            and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
            with indices selected in [0, ..., num_choices].

    Outputs:
        if `labels` is not `None`:
            Outputs the CrossEntropy classification loss of the output with the labels.
        if `labels` is `None`:
            Outputs the classification logits of shape [batch_size, num_labels].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
    input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
    token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    num_choices = 2

    model = BertForMultipleChoice(config, num_choices)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config, num_choices):
        super(BertForMultipleChoice, self).__init__(config)
        self.num_choices = num_choices
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        _, pooled_output = self.bert(flat_input_ids,
                                     flat_token_type_ids,
                                     flat_attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, self.num_choices)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
            return loss
        else:
            return reshaped_logits
class
BertForTokenClassification
(
BertPreTrainedModel
):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_labels):
        super(BertForTokenClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits
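A standalone sketch of the "active loss" masking used above: only positions with `attention_mask == 1` contribute to the token-classification loss. Shapes and values are made up for illustration:

```python
import torch
from torch.nn import CrossEntropyLoss

num_labels = 2
logits = torch.randn(2, 3, num_labels)                 # [batch, seq_len, num_labels]
labels = torch.tensor([[0, 1, 0], [1, 0, 1]])
attention_mask = torch.tensor([[1, 1, 0], [1, 0, 0]])  # 0 marks padding

active = attention_mask.view(-1) == 1                  # boolean mask over flattened positions
loss = CrossEntropyLoss()(logits.view(-1, num_labels)[active], labels.view(-1)[active])
print(loss.item())
```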
class BertForQuestionAnswering(BertPreTrainedModel):
"""BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, args):
        super(BertForQuestionAnswering, self).__init__(config)
        self.bert = BertModel(config, args)
        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        else:
            return start_logits, end_logits
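A standalone sketch of the span-loss handling above: gold positions that fall outside the model input are clamped to `ignored_index` (the sequence length) and then skipped by the loss. Values here are illustrative only:

```python
import torch
from torch.nn import CrossEntropyLoss

seq_len = 5
start_logits = torch.randn(2, seq_len)
start_positions = torch.tensor([3, 7])            # 7 lies outside the 5-token input

ignored_index = start_logits.size(1)              # 5
start_positions = start_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
print(loss_fct(start_logits, start_positions))    # only the first example contributes
```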
Deepspeed/BingBertGlue/nvidia/modelingpreln.py 0 → 100644
# DeepSpeed note, code taken from commit 3d59216cec89a363649b4fe3d15295ba936ced0f
# https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals

import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils import checkpoint

from turing.file_utils import cached_path

from torch.nn import Module
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.nn.init as init

logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
CONFIG_NAME = 'bert_config.json'
WEIGHTS_NAME = 'pytorch_model.bin'
TF_WEIGHTS_NAME = 'model.ckpt'
def get_deepspeed_config(args):
    if hasattr(args, 'deepspeed_config') and args.deepspeed_config:
        from deepspeed import DeepSpeedConfig
        return DeepSpeedConfig(args.deepspeed_config)
    else:
        raise RuntimeError('deepspeed_config is not found in args.')
def get_sparse_attention_config(args, num_heads):
    if args.deepspeed_sparse_attention:
        ds_config = get_deepspeed_config(args)
        if hasattr(ds_config, 'sparse_attention') and ds_config.sparse_attention:
            sa_config = ds_config.sparse_attention
            sa_mode = sa_config.get('mode')
            if (sa_mode == 'dense'):
                from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig
            elif (sa_mode == 'fixed'):
                from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
            elif (sa_mode == 'bigbird'):
                from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig
            elif (sa_mode == 'bslongformer'):
                from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig
            elif (sa_mode == 'variable'):
                from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig
            else:
                raise NotImplementedError(
                    f'Given sparsity mode, {sa_mode}, has not been implemented yet!')
            del sa_config['mode']
            return STConfig(num_heads=num_heads, **sa_config)
        else:
            from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
            print('deepspeed sparse attention is not set; Fixed sparsity is used as default.')
            return STConfig(num_heads=num_heads)
    else:
        return None


def get_sparse_attention_utils(sparse_attention_config):
    if sparse_attention_config is not None:
        from deepspeed.ops.sparse_attention import SparseAttentionUtils
        return SparseAttentionUtils
    return None
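For orientation, a hedged sketch of the `sparse_attention` block that `get_sparse_attention_config()` expects to find inside the DeepSpeed JSON config. The field names follow DeepSpeed's sparse-attention documentation, but the values are illustrative only, and the snippet below only mimics the dispatch on `mode` with plain dictionaries (it does not import DeepSpeed):

```python
# Hypothetical shape of the "sparse_attention" section of the deepspeed_config file.
sparse_attention_block = {
    "mode": "fixed",        # dispatched to FixedSparsityConfig above
    "block": 16,            # block size, later used by pad_to_block_size()
    "num_local_blocks": 4,
    "num_global_blocks": 1,
}

# After 'mode' is removed, the remaining keys are forwarded as keyword arguments
# to the selected sparsity-config class.
sa_config = dict(sparse_attention_block)
sa_mode = sa_config.pop("mode")
print(sa_mode, sa_config)
```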
def load_tf_weights_in_bert(model, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
              "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
@torch.jit.script
def f_gelu(x):
    pdtype = x.dtype
    x = x.float()
    y = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return y.to(pdtype)


@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))


@torch.jit.script
def bias_tanh(bias, y):
    x = bias + y
    return torch.tanh(x)


def gelu(x):
    """Implementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return f_gelu(x)


def swish(x):
    return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
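A quick standalone check (not part of the file) that the erf-based gelu used above and the tanh approximation quoted in the docstring agree closely; `bias_gelu` uses the constant 1.41421 as an approximation of sqrt(2):

```python
import math
import torch

x = torch.linspace(-3, 3, 7)
gelu_erf = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
gelu_tanh = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
print((gelu_erf - gelu_tanh).abs().max())  # small difference, well below 1e-2
```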
class LinearActivation(Module):
    r"""Fused Linear and activation Module.
    """
    __constants__ = ['bias']

    def __init__(self, in_features, out_features, act='gelu', bias=True):
        super(LinearActivation, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.fused_gelu = False
        self.fused_tanh = False
        if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)):
            if bias and act == 'gelu':
                self.fused_gelu = True
            elif bias and act == 'tanh':
                self.fused_tanh = True
            else:
                self.act_fn = ACT2FN[act]
        else:
            self.act_fn = act
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        if self.fused_gelu:
            return bias_gelu(self.bias, F.linear(input, self.weight, None))
        elif self.fused_tanh:
            return bias_tanh(self.bias, F.linear(input, self.weight, None))
        else:
            return self.act_fn(F.linear(input, self.weight, self.bias))

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None)
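A hedged usage sketch, assuming the `LinearActivation` class above is in scope: with `bias=True` and `act='gelu'` the module takes the fused `bias_gelu` path, while `act='relu'` falls back to the `ACT2FN` lookup. Sizes are arbitrary:

```python
import torch

fused = LinearActivation(8, 16, act='gelu', bias=True)
unfused = LinearActivation(8, 16, act='relu', bias=True)
x = torch.randn(4, 8)
print(fused(x).shape, fused.fused_gelu)      # torch.Size([4, 16]) True
print(unfused(x).shape, unfused.fused_gelu)  # torch.Size([4, 16]) False
```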
class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.
    """
    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02):
"""Constructs BertConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             "or the path to a pretrained model config file (str)")

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
try:
    import apex
    #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
    import apex.normalization
    #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
    BertLayerNorm = apex.normalization.FusedLayerNorm
except ImportError:
    print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")

    class BertLayerNorm(nn.Module):
        def __init__(self, hidden_size, eps=1e-12):
            """Construct a layernorm module in the TF style (epsilon inside the square root).
            """
            super(BertLayerNorm, self).__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.bias = nn.Parameter(torch.zeros(hidden_size))
            self.variance_epsilon = eps

        def forward(self, x):
            pdtype = x.dtype
            x = x.float()
            u = x.mean(-1, keepdim=True)
            s = (x - u).pow(2).mean(-1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
            return self.weight * x.to(pdtype) + self.bias
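A standalone sanity check, assuming the pure-PyTorch fallback above is the `BertLayerNorm` in scope: with default affine parameters (weight 1, bias 0) the TF-style normalization matches `torch.nn.LayerNorm` for the same epsilon:

```python
import torch

hidden = 16
x = torch.randn(2, 4, hidden)
ln_ref = torch.nn.LayerNorm(hidden, eps=1e-12)
ln_tf = BertLayerNorm(hidden, eps=1e-12)
print(torch.allclose(ln_ref(x), ln_tf(x), atol=1e-6))  # True
```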
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.softmax = nn.Softmax(dim=-1)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def transpose_key_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 3, 1)

    def forward(self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_key_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer)
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask
        pdtype = attention_scores.dtype

        # Normalize the attention scores to probabilities.
        attention_probs = self.softmax(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
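A shape-only sketch of the head split above (sizes are made up): queries and values are laid out as [batch, heads, seq, head_dim] while keys are laid out as [batch, heads, head_dim, seq], so a single `torch.matmul(query_layer, key_layer)` already yields the [batch, heads, seq, seq] score matrix without an explicit transpose in the forward pass:

```python
import torch

batch, seq, heads, head_dim = 2, 5, 4, 8
hidden = heads * head_dim
q = torch.randn(batch, seq, hidden).view(batch, seq, heads, head_dim).permute(0, 2, 1, 3)
k = torch.randn(batch, seq, hidden).view(batch, seq, heads, head_dim).permute(0, 2, 3, 1)
print(torch.matmul(q, k).shape)  # torch.Size([2, 4, 5, 5])
```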
class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dense.bert_output_layer = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class BertAttention(nn.Module):
    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output
class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act)

    def forward(self, hidden_states):
        hidden_states = self.dense_act(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dense.bert_output_layer = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.PreAttentionLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.PostAttentionLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        input_layer_norm = self.PreAttentionLayerNorm(hidden_states)
        attention_output = self.attention(input_layer_norm, attention_mask)

        intermediate_input = hidden_states + attention_output
        intermediate_layer_norm = self.PostAttentionLayerNorm(intermediate_input)
        intermediate_output = self.intermediate(intermediate_layer_norm)

        layer_output = self.output(intermediate_output)
        return layer_output + intermediate_input
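This is the pre-LayerNorm ordering the file name ("preln") refers to: each sub-layer normalizes its input before the transformation, and the residual is added to the un-normalized input. A standalone sketch of that pattern, with a generic `sublayer` standing in for attention or the MLP (not code from the repository):

```python
import torch

def pre_ln_block(x, norm, sublayer):
    # pre-LN: residual bypasses the normalization
    return x + sublayer(norm(x))

norm = torch.nn.LayerNorm(8)
sublayer = torch.nn.Linear(8, 8)
x = torch.randn(2, 8)
print(pre_ln_block(x, norm, sublayer).shape)  # torch.Size([2, 8])
```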
class BertEncoder(nn.Module):
    def __init__(self, config, args, sparse_attention_config=None):
        super(BertEncoder, self).__init__()
        #Added later to make it similar to GPT-2
        self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

        if args.deepspeed_transformer_kernel and args.deepspeed_sparse_attention:
            raise NotImplementedError(
                f'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!'
            )

        if args.deepspeed_transformer_kernel:
            from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
            ds_config = get_deepspeed_config(args)

            cuda_config = DeepSpeedTransformerConfig(
                batch_size=ds_config.train_micro_batch_size_per_gpu,
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                heads=config.num_attention_heads,
                attn_dropout_ratio=config.attention_probs_dropout_prob,
                hidden_dropout_ratio=config.hidden_dropout_prob,
                num_hidden_layers=config.num_hidden_layers,
                initializer_range=config.initializer_range,
                local_rank=args.local_rank if hasattr(args, 'local_rank') else -1,
                seed=args.seed,
                fp16=ds_config.fp16_enabled,
                pre_layer_norm=True,
                # attn_dropout_checkpoint=args.attention_dropout_checkpoint,
                # normalize_invertible=args.normalize_invertible,
                # gelu_checkpoint=args.gelu_checkpoint,
                # stochastic_mode=args.stochastic_mode
                attn_dropout_checkpoint=False,
                normalize_invertible=False,
                gelu_checkpoint=False,
                stochastic_mode=False)

            self.layer = nn.ModuleList([
                copy.deepcopy(DeepSpeedTransformerLayer(cuda_config))
                for _ in range(config.num_hidden_layers)
            ])
        else:
            layer = BertLayer(config)
            if sparse_attention_config is not None:
                from deepspeed.ops.sparse_attention import BertSparseSelfAttention
                layer.attention.self = BertSparseSelfAttention(config, sparsity_config=sparse_attention_config)

            self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

    # def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
    #     all_encoder_layers = []
    #     for layer_module in self.layer:
    #         hidden_states = layer_module(hidden_states, attention_mask)
    #         if output_all_encoded_layers:
    #             all_encoder_layers.append(hidden_states)
    #     if not output_all_encoded_layers:
    #         all_encoder_layers.append(hidden_states)
    #     return all_encoder_layers

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False):
        all_encoder_layers = []

        def custom(start, end):
            def custom_forward(*inputs):
                layers = self.layer[start:end]
                x_ = inputs[0]
                for layer in layers:
                    x_ = layer(x_, inputs[1])
                return x_

            return custom_forward

        if checkpoint_activations:
            l = 0
            num_layers = len(self.layer)
            chunk_length = math.ceil(math.sqrt(num_layers))
            while l < num_layers:
                hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length), hidden_states, attention_mask * 1)
                l += chunk_length
            # decoder layers
        else:
            for i, layer_module in enumerate(self.layer):
                hidden_states = layer_module(hidden_states, attention_mask)

                if output_all_encoded_layers:
                    all_encoder_layers.append(hidden_states)

        if not output_all_encoded_layers or checkpoint_activations:
            hidden_states = self.FinalLayerNorm(hidden_states)
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers
#class BertEncoder(nn.Module):
# def __init__(self, config):
# super(BertEncoder, self).__init__()
# layer = BertLayer(config)
# self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
#
# def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
# all_encoder_layers = []
# for layer_module in self.layer:
# hidden_states = layer_module(hidden_states, attention_mask)
# if output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# if not output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# return all_encoder_layers
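The activation-checkpointing path in `BertEncoder.forward` above groups roughly sqrt(N) layers per `torch.utils.checkpoint` call, trading recomputation in the backward pass for a smaller activation footprint. A standalone sketch of that chunking arithmetic (numbers are illustrative; the real code lets the slice `self.layer[start:end]` handle the final partial chunk):

```python
import math

num_layers = 24
chunk_length = math.ceil(math.sqrt(num_layers))   # 5
chunks = [(l, min(l + chunk_length, num_layers)) for l in range(0, num_layers, chunk_length)]
print(chunk_length, chunks)  # 5 [(0, 5), (5, 10), (10, 15), (15, 20), (20, 24)]
```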
class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh")

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense_act(first_token_tensor)
        return pooled_output
class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super(BertPredictionHeadTransform, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden_states):
        hidden_states = self.dense_act(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states
class BertLMPredictionHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                 bert_model_embedding_weights.size(0),
                                 bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))

    def forward(self, hidden_states, masked_token_indexes):
        hidden_states = self.transform(hidden_states)

        if masked_token_indexes is not None:
            hidden_states = torch.index_select(
                hidden_states.view(-1, hidden_states.shape[-1]), 0, masked_token_indexes)

        torch.cuda.nvtx.range_push(
            "decoder input.size() = {}, weight.size() = {}".format(
                hidden_states.size(), self.decoder.weight.size()))
        hidden_states = self.decoder(hidden_states) + self.bias
        torch.cuda.nvtx.range_pop()
        return hidden_states
class BertOnlyMLMHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertOnlyMLMHead, self).__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super(BertOnlyNSPHead, self).__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


class BertPreTrainingHeads(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertPreTrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output, masked_token_indexes=None):
        prediction_scores = self.predictions(sequence_output, masked_token_indexes)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    def __init__(self, config, *inputs, **kwargs):
        super(BertPreTrainedModel, self).__init__()
        if not isinstance(config, BertConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__))
        self.config = config

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            num_layers = self.config.num_hidden_layers
            std = self.config.initializer_range
            if hasattr(module, 'bert_output_layer'):
                # "Accounting for accumulation on the residual path"
                #print("Accounting for accumulation on the residual path")
                std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
            module.weight.data.normal_(mean=0.0, std=std)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs):
"""
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            archive_file = pretrained_model_name_or_path
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), archive_file))
            return None
        if resolved_archive_file == archive_file:
            logger.info("loading archive file {}".format(archive_file))
        else:
            logger.info("loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file))
        tempdir = None
        if os.path.isdir(resolved_archive_file) or from_tf:
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info("extracting archive file {} to temp dir {}".format(
                resolved_archive_file, tempdir))
            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
                archive.extractall(tempdir)
            serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None and not from_tf:
            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
            state_dict = torch.load(
                weights_path,
                map_location='cpu' if not torch.cuda.is_available() else None)
        if tempdir:
            # Clean up temp dir
            shutil.rmtree(tempdir)
        if from_tf:
            # Directly load from a TensorFlow checkpoint
            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
            return load_tf_weights_in_bert(model, weights_path)
        # Load from a PyTorch state_dict
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                         missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        start_prefix = ''
        if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
            start_prefix = 'bert.'
        load(model, prefix=start_prefix)
        if len(missing_keys) > 0:
            logger.info("Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info("Weights from pretrained model not used in {}: {}".format(
                model.__class__.__name__, unexpected_keys))
        if len(error_msgs) > 0:
            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                model.__class__.__name__, "\n\t".join(error_msgs)))
        return model
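A hedged usage sketch of `from_pretrained` (not taken from the repository): weights can be loaded either by one of the archive names listed in `PRETRAINED_MODEL_ARCHIVE_MAP` or from a local archive/directory containing `bert_config.json` and `pytorch_model.bin`. The cache path below is a placeholder:

```python
# BertForMaskedLM only needs `config`, so it works with the cls(config, ...) call above.
model = BertForMaskedLM.from_pretrained(
    'bert-base-uncased',          # or a local path to an extracted archive
    cache_dir='/tmp/bert_cache')  # hypothetical cache directory
```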
class BertModel(BertPreTrainedModel):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
config: a BertConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated to the first character of the
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, args=None):
        super(BertModel, self).__init__(config)
        self.embeddings = BertEmbeddings(config)
        # set pad_token_id that is used for sparse attention padding
        self.pad_token_id = config.pad_token_id if hasattr(
            config, 'pad_token_id') and config.pad_token_id is not None else 0
        # set sparse_attention_config if it has been selected
        self.sparse_attention_config = get_sparse_attention_config(args, config.num_attention_heads)
        self.sparse_attention_utils = get_sparse_attention_utils(self.sparse_attention_config)
        self.encoder = BertEncoder(config, args, sparse_attention_config=self.sparse_attention_config)
        self.pooler = BertPooler(config)
        self.apply(self.init_bert_weights)
        logger.info("Init BERT pretrain model")
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, checkpoint_activations=False):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # If BertEncoder uses sparse attention, it needs to be padded based on the sparse attention block size
        if self.sparse_attention_config is not None:
            pad_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = self.sparse_attention_utils.pad_to_block_size(
                block_size=self.sparse_attention_config.block,
                input_ids=input_ids,
                attention_mask=extended_attention_mask,
                token_type_ids=token_type_ids,
                position_ids=None,
                inputs_embeds=None,
                pad_token_id=self.pad_token_id,
                model_mbeddings=self.embeddings)

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoded_layers = self.encoder(embedding_output,
                                      extended_attention_mask,
                                      output_all_encoded_layers=output_all_encoded_layers,
                                      checkpoint_activations=checkpoint_activations)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)

        # If BertEncoder uses sparse attention, and input_ids were padded, sequence output needs to be unpadded to original length
        if self.sparse_attention_config is not None and pad_len > 0:
            encoded_layers[-1] = self.sparse_attention_utils.unpad_sequence_output(pad_len, encoded_layers[-1])

        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        return encoded_layers, pooled_output
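A standalone sketch of the additive attention mask built above: positions with mask value 1 get 0.0 added to their raw scores, while padded positions get -10000.0, which the softmax treats as effectively removed. Sizes are made up:

```python
import torch

attention_mask = torch.tensor([[1, 1, 0]])
extended = attention_mask.unsqueeze(1).unsqueeze(2).float()  # [batch, 1, 1, seq]
extended = (1.0 - extended) * -10000.0
print(extended)  # visible positions -> 0.0, padded position -> -10000.0
```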
class BertForPreTrainingPreLN(BertPreTrainedModel):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
- the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, args):
        super(BertForPreTrainingPreLN, self).__init__(config)
        self.bert = BertModel(config, args)
        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)
        self.args = args

    def forward(self, batch, log=True):
        input_ids = batch[1]
        token_type_ids = batch[3]
        attention_mask = batch[2]
        masked_lm_labels = batch[5]
        next_sentence_label = batch[4]
        checkpoint_activations = False

        sequence_output, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False,
            checkpoint_activations=checkpoint_activations)

        if masked_lm_labels is not None and next_sentence_label is not None:
            # filter out all masked labels.
            masked_token_indexes = torch.nonzero(
                (masked_lm_labels + 1).view(-1)).view(-1)
            prediction_scores, seq_relationship_score = self.cls(
                sequence_output, pooled_output, masked_token_indexes)
            target = torch.index_select(masked_lm_labels.view(-1), 0, masked_token_indexes)

            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), target)
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
            return total_loss
        else:
            prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
            return prediction_scores, seq_relationship_score
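A standalone sketch of how masked positions are selected above: labels of -1 mean "not masked", so `(labels + 1)` is non-zero exactly at the masked positions, and only those positions feed the MLM loss. Values are illustrative:

```python
import torch

masked_lm_labels = torch.tensor([[-1, 105, -1], [-1, -1, 233]])
masked_token_indexes = torch.nonzero((masked_lm_labels + 1).view(-1)).view(-1)
target = torch.index_select(masked_lm_labels.view(-1), 0, masked_token_indexes)
print(masked_token_indexes, target)  # tensor([1, 5]) tensor([105, 233])
```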
class BertForMaskedLM(BertPreTrainedModel):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForMaskedLM, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        prediction_scores = self.cls(sequence_output)

        if masked_lm_labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size),
                                      masked_lm_labels.view(-1))
            return masked_lm_loss
        else:
            return prediction_scores
class BertForNextSentencePrediction(BertPreTrainedModel):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `next_sentence_label` is not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForNextSentencePrediction, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, checkpoint_activations=False):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        seq_relationship_score = self.cls(pooled_output)

        if next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            return next_sentence_loss
        else:
            return seq_relationship_score
class BertForSequenceClassification(BertPreTrainedModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, args, config, num_labels):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config, args=args)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids,
                                     attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits
class BertForMultipleChoice(BertPreTrainedModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_choices`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_choices = 2
model = BertForMultipleChoice(config, num_choices)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_choices):
        super(BertForMultipleChoice, self).__init__(config)
        self.num_choices = num_choices
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        _, pooled_output = self.bert(flat_input_ids,
                                     flat_token_type_ids,
                                     flat_attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, self.num_choices)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
            return loss
        else:
            return reshaped_logits
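The multiple-choice head scores each choice independently by flattening the choice dimension before BERT and regrouping afterwards. A minimal shape-only sketch (made-up sizes, with a random tensor standing in for the pooled output):

```python
import torch

batch_size, num_choices, seq_len, hidden = 2, 4, 8, 16
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))   # [8, 8] = [B*C, L]
pooled = torch.randn(flat_input_ids.size(0), hidden)      # stand-in for BERT's pooled output
logits = torch.nn.Linear(hidden, 1)(pooled)               # one score per (example, choice)
reshaped_logits = logits.view(-1, num_choices)            # [2, 4]; softmax runs over choices
```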
class BertForTokenClassification(BertPreTrainedModel):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_labels):
        super(BertForTokenClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits
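The `active_loss` filtering above drops padded positions from the token-classification loss. A minimal standalone sketch with made-up tensors:

```python
import torch

num_labels = 3
logits = torch.randn(2, 4, num_labels)                     # [batch, seq_len, num_labels]
labels = torch.randint(0, num_labels, (2, 4))
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

active = attention_mask.view(-1) == 1                      # boolean mask over B*L positions
active_logits = logits.view(-1, num_labels)[active]        # [5, 3]
active_labels = labels.view(-1)[active]                    # [5]
loss = torch.nn.CrossEntropyLoss()(active_logits, active_labels)
```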
class BertForQuestionAnswering(BertPreTrainedModel):
"""BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForQuestionAnswering, self).__init__(config)
        self.bert = BertModel(config)
        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                start_positions=None,
                end_positions=None,
                checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        else:
            return start_logits, end_logits
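The QA head turns the 2-unit output into start/end logits and clamps out-of-range span labels onto the `ignore_index`. A minimal standalone sketch (illustrative values only):

```python
import torch

seq_len = 6
logits = torch.randn(2, seq_len, 2)                 # qa_outputs applied to the sequence output
start_logits, end_logits = logits.split(1, dim=-1)  # two [2, 6, 1] tensors
start_logits = start_logits.squeeze(-1)             # [2, 6]
end_logits = end_logits.squeeze(-1)

start_positions = torch.tensor([3, 99])             # 99 falls outside the sequence
start_positions.clamp_(0, seq_len)                  # becomes seq_len, i.e. the ignore_index
loss = torch.nn.CrossEntropyLoss(ignore_index=seq_len)(start_logits, start_positions)
```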
Deepspeed/BingBertGlue/nvidia/modelingpreln_layerdrop.py
0 → 100644
View file @ 316d3f90
# DeepSpeed note, code taken from commit 3d59216cec89a363649b4fe3d15295ba936ced0f
# https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/modeling.py
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals

import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils import checkpoint

from turing.file_utils import cached_path

from torch.nn import Module
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.nn.init as init
import numpy as np

logger = logging.getLogger(__name__)

PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
    'bert-large-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
    'bert-base-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
    'bert-large-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
    'bert-base-multilingual-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
    'bert-base-multilingual-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
    'bert-base-chinese':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
CONFIG_NAME = 'bert_config.json'
WEIGHTS_NAME = 'pytorch_model.bin'
TF_WEIGHTS_NAME = 'model.ckpt'
def get_deepspeed_config(args):
    if hasattr(args, 'deepspeed_config') and args.deepspeed_config:
        from deepspeed import DeepSpeedConfig
        return DeepSpeedConfig(args.deepspeed_config)
    else:
        raise RuntimeError('deepspeed_config is not found in args.')
def get_sparse_attention_config(args, num_heads):
    if args.deepspeed_sparse_attention:
        ds_config = get_deepspeed_config(args)
        if hasattr(ds_config, 'sparse_attention') and ds_config.sparse_attention:
            sa_config = ds_config.sparse_attention
            sa_mode = sa_config.get('mode')
            if (sa_mode == 'dense'):
                from deepspeed.ops.sparse_attention import DenseSparsityConfig as STConfig
            elif (sa_mode == 'fixed'):
                from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
            elif (sa_mode == 'bigbird'):
                from deepspeed.ops.sparse_attention import BigBirdSparsityConfig as STConfig
            elif (sa_mode == 'bslongformer'):
                from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig as STConfig
            elif (sa_mode == 'variable'):
                from deepspeed.ops.sparse_attention import VariableSparsityConfig as STConfig
            else:
                raise NotImplementedError(
                    f'Given sparsity mode, {sa_mode}, has not been implemented yet!')
            del sa_config['mode']
            return STConfig(num_heads=num_heads, **sa_config)
        else:
            from deepspeed.ops.sparse_attention import FixedSparsityConfig as STConfig
            print('deepspeed sparse attention is not set; Fixed sparsity is used as default.')
            return STConfig(num_heads=num_heads)
    else:
        return None
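This helper only reads the `mode` key itself and forwards everything else to the selected sparsity config class. A hypothetical sketch of the corresponding `sparse_attention` section of the DeepSpeed JSON config; only `"mode"` is confirmed by the code above, the remaining key shown is an assumption and the accepted keys depend on the chosen mode:

```python
# Hypothetical shape of the DeepSpeed config section consumed above.
sparse_attention_section = {
    "mode": "fixed",   # one of: dense, fixed, bigbird, bslongformer, variable
    "block": 16,       # example extra key, forwarded as **sa_config to the SparsityConfig class
}
```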
def get_sparse_attention_utils(sparse_attention_config):
    if sparse_attention_config is not None:
        from deepspeed.ops.sparse_attention import SparseAttentionUtils
        return SparseAttentionUtils
    return None
def load_tf_weights_in_bert(model, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        print(
            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
@torch.jit.script
def f_gelu(x):
    pdtype = x.dtype
    x = x.float()
    y = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return y.to(pdtype)


@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))


@torch.jit.script
def bias_tanh(bias, y):
    x = bias + y
    return torch.tanh(x)


def gelu(x):
"""Implementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
    return f_gelu(x)


def swish(x):
    return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
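The docstring above contrasts the exact erf-based GELU with OpenAI GPT's tanh approximation. A minimal standalone sketch comparing the two (the size of the numeric gap is approximate):

```python
import math
import torch

x = torch.linspace(-3, 3, steps=7)
gelu_erf = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
gelu_tanh = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
print((gelu_erf - gelu_tanh).abs().max())  # small (on the order of 1e-3 or less), but not zero
```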
class LinearActivation(Module):
    r"""Fused Linear and activation Module.
    """
    __constants__ = ['bias']

    def __init__(self, in_features, out_features, act='gelu', bias=True):
        super(LinearActivation, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.fused_gelu = False
        self.fused_tanh = False
        if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)):
            if bias and act == 'gelu':
                self.fused_gelu = True
            elif bias and act == 'tanh':
                self.fused_tanh = True
            else:
                self.act_fn = ACT2FN[act]
        else:
            self.act_fn = act
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        if self.fused_gelu:
            return bias_gelu(self.bias, F.linear(input, self.weight, None))
        elif self.fused_tanh:
            return bias_tanh(self.bias, F.linear(input, self.weight, None))
        else:
            return self.act_fn(F.linear(input, self.weight, self.bias))

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None)
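A minimal usage sketch of the `LinearActivation` module defined above (sizes are illustrative); with `act='gelu'` and a bias it takes the fused `bias_gelu` path, so the linear layer is applied without its bias and the bias add happens inside the activation kernel:

```python
import torch

layer = LinearActivation(768, 3072, act='gelu')   # same role as BertIntermediate's dense_act
hidden = torch.randn(2, 128, 768)
out = layer(hidden)                               # [2, 128, 3072]
```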
class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.
    """
    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02):
"""Constructs BertConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
        if isinstance(vocab_size_or_config_json_file, str) or (
                sys.version_info[0] == 2
                and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             "or the path to a pretrained model config file (str)")

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
try:
    import apex
    #apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
    import apex.normalization
    #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
    BertLayerNorm = apex.normalization.FusedLayerNorm
except ImportError:
    print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")

    class BertLayerNorm(nn.Module):
        def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
            super(BertLayerNorm, self).__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.bias = nn.Parameter(torch.zeros(hidden_size))
            self.variance_epsilon = eps

        def forward(self, x):
            pdtype = x.dtype
            x = x.float()
            u = x.mean(-1, keepdim=True)
            s = (x - u).pow(2).mean(-1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
            return self.weight * x.to(pdtype) + self.bias
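The fallback `BertLayerNorm` above computes a TF-style layer norm with the epsilon inside the square root. A minimal standalone sketch checking it against `torch.nn.functional.layer_norm` (the quoted tolerance is approximate):

```python
import torch
import torch.nn.functional as F

x = torch.randn(2, 4, 8)
weight, bias, eps = torch.ones(8), torch.zeros(8), 1e-12

u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
manual = weight * (x - u) / torch.sqrt(s + eps) + bias
builtin = F.layer_norm(x, (8,), weight, bias, eps)
print((manual - builtin).abs().max())  # tiny, floating-point-level difference
```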
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.softmax = nn.Softmax(dim=-1)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def transpose_key_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 3, 1)

    def forward(self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_key_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer)
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask

        pdtype = attention_scores.dtype
        # Normalize the attention scores to probabilities.
        attention_probs = self.softmax(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
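`transpose_key_for_scores` permutes the key to `[batch, heads, head_size, seq]` so the score matrix comes out of a single `matmul` without an explicit transpose. A minimal shape-only sketch (made-up sizes):

```python
import torch

batch, heads, seq, head_size = 2, 12, 128, 64
q = torch.randn(batch, heads, seq, head_size)      # transpose_for_scores output
k = torch.randn(batch, heads, head_size, seq)      # transpose_key_for_scores output
scores = torch.matmul(q, k) / head_size ** 0.5     # [2, 12, 128, 128] raw attention scores
```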
class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dense.bert_output_layer = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class BertAttention(nn.Module):
    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output


class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size,
                                          config.intermediate_size,
                                          act=config.hidden_act)

    def forward(self, hidden_states):
        hidden_states = self.dense_act(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dense.bert_output_layer = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.PreAttentionLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.PostAttentionLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask, action=1, keep_prob=1.0):
        if action == 0:
            intermediate_input = hidden_states
        else:
            input_layer_norm = self.PreAttentionLayerNorm(hidden_states)
            attention_output = self.attention(input_layer_norm, attention_mask)
            attention_output = attention_output * 1 / keep_prob
            intermediate_input = hidden_states + attention_output

        if action == 0:
            layer_output = intermediate_input
        else:
            intermediate_layer_norm = self.PostAttentionLayerNorm(intermediate_input)
            intermediate_output = self.intermediate(intermediate_layer_norm)
            layer_output = self.output(intermediate_output)
            layer_output = layer_output * 1 / keep_prob
            layer_output = layer_output + intermediate_input

        return layer_output
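When a layer fires (`action == 1`), its two residual branches are scaled by `1 / keep_prob`, the same inverted scaling used by dropout, so the expected activation roughly matches inference, where every layer runs with `keep_prob = 1`. A minimal standalone sketch with toy numbers:

```python
import torch

keep_prob = 0.8
residual_branch = torch.ones(3)
kept_branch = residual_branch * 1 / keep_prob              # scaled when the layer fires
expected = keep_prob * kept_branch + (1 - keep_prob) * 0   # dropped layers contribute nothing
print(expected)                                            # ~[1., 1., 1.], i.e. the unscaled branch
```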
class BertEncoder(nn.Module):
    def __init__(self, config, args, sparse_attention_config=None):
        super(BertEncoder, self).__init__()

        # Added later to make it similar to GPT-2
        self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

        if args.deepspeed_transformer_kernel and args.deepspeed_sparse_attention:
            raise NotImplementedError(
                f'Currently DeepSpeed Transformer Kernels do not support Sparse Attention. To use Sparse Attention, you need to disable Transformer Kernels!'
            )

        if args.deepspeed_transformer_kernel:
            from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig

            ds_config = get_deepspeed_config(args)

            cuda_config = DeepSpeedTransformerConfig(
                batch_size=ds_config.train_micro_batch_size_per_gpu,
                max_seq_length=args.max_seq_length,
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                heads=config.num_attention_heads,
                attn_dropout_ratio=config.attention_probs_dropout_prob,
                hidden_dropout_ratio=config.hidden_dropout_prob,
                num_hidden_layers=config.num_hidden_layers,
                initializer_range=config.initializer_range,
                local_rank=args.local_rank if hasattr(args, 'local_rank') else -1,
                seed=args.seed,
                fp16=ds_config.fp16_enabled,
                pre_layer_norm=True,
                attn_dropout_checkpoint=args.attention_dropout_checkpoint,
                normalize_invertible=args.normalize_invertible,
                gelu_checkpoint=args.gelu_checkpoint,
                stochastic_mode=args.stochastic_mode)

            self.layer = nn.ModuleList([
                copy.deepcopy(DeepSpeedTransformerLayer(i, cuda_config))
                for i in range(config.num_hidden_layers)
            ])
        else:
            layer = BertLayer(config)
            if sparse_attention_config is not None:
                from deepspeed.ops.sparse_attention import BertSparseSelfAttention

                layer.attention.self = BertSparseSelfAttention(
                    config, sparsity_config=sparse_attention_config)

            self.layer = nn.ModuleList(
                [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
# def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
# all_encoder_layers = []
# for layer_module in self.layer:
# hidden_states = layer_module(hidden_states, attention_mask)
# if output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# if not output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# return all_encoder_layers
    def forward(self,
                hidden_states,
                attention_mask,
                output_all_encoded_layers=True,
                checkpoint_activations=False,
                progressive_layer_drop=False,
                theta=0.5):
        all_encoder_layers = []

        def custom(start, end):
            def custom_forward(*inputs):
                layers = self.layer[start:end]
                x_ = inputs[0]
                for layer in layers:
                    x_ = layer(x_, inputs[1])
                return x_

            return custom_forward

        if checkpoint_activations:
            l = 0
            num_layers = len(self.layer)
            chunk_length = math.ceil(math.sqrt(num_layers))
            while l < num_layers:
                hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length),
                                                      hidden_states, attention_mask * 1)
                l += chunk_length
            # decoder layers
        else:
            if not progressive_layer_drop:
                for i, layer_module in enumerate(self.layer):
                    hidden_states = layer_module(hidden_states, attention_mask)
                    if output_all_encoded_layers:
                        all_encoder_layers.append(hidden_states)
            else:
                drop_prob = 1 - theta
                step = drop_prob / len(self.layer)
                p = 1.0
                # print("+ stochastic drop, depth, Theta {}:".format(theta))
                for i, layer_module in enumerate(self.layer):
                    action = np.random.choice([1, 0], p=[p, 1 - p])
                    p = p - step
                    hidden_states = layer_module(hidden_states, attention_mask, action, p)
                    if output_all_encoded_layers:
                        all_encoder_layers.append(hidden_states)

        if not output_all_encoded_layers or checkpoint_activations:
            hidden_states = self.FinalLayerNorm(hidden_states)
            all_encoder_layers.append(hidden_states)

        return all_encoder_layers
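In the progressive-layer-drop branch above, the keep probability starts at 1.0 and decays linearly by `(1 - theta) / num_layers` per layer, so lower layers are almost always kept and deeper layers are dropped more often. A minimal standalone sketch of the schedule for `theta = 0.5` and 12 layers:

```python
num_layers, theta = 12, 0.5
step = (1 - theta) / num_layers
p = 1.0
keep_probs = []
for _ in range(num_layers):
    keep_probs.append(p)   # probability that this layer's blocks run (action == 1)
    p -= step
print([round(kp, 3) for kp in keep_probs])  # 1.0 down toward ~0.54 in equal steps
```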
#class BertEncoder(nn.Module):
# def __init__(self, config):
# super(BertEncoder, self).__init__()
# layer = BertLayer(config)
# self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
#
# def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
# all_encoder_layers = []
# for layer_module in self.layer:
# hidden_states = layer_module(hidden_states, attention_mask)
# if output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# if not output_all_encoded_layers:
# all_encoder_layers.append(hidden_states)
# return all_encoder_layers
class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh")

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense_act(first_token_tensor)
        return pooled_output
class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super(BertPredictionHeadTransform, self).__init__()
        self.dense_act = LinearActivation(config.hidden_size,
                                          config.hidden_size,
                                          act=config.hidden_act)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden_states):
        hidden_states = self.dense_act(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states
class BertLMPredictionHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                 bert_model_embedding_weights.size(0),
                                 bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))

    def forward(self, hidden_states, masked_token_indexes):
        hidden_states = self.transform(hidden_states)

        if masked_token_indexes is not None:
            hidden_states = torch.index_select(
                hidden_states.view(-1, hidden_states.shape[-1]), 0, masked_token_indexes)

        torch.cuda.nvtx.range_push("decoder input.size() = {}, weight.size() = {}".format(
            hidden_states.size(), self.decoder.weight.size()))
        hidden_states = self.decoder(hidden_states) + self.bias
        torch.cuda.nvtx.range_pop()
        return hidden_states
class BertOnlyMLMHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertOnlyMLMHead, self).__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super(BertOnlyNSPHead, self).__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


class BertPreTrainingHeads(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertPreTrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output, masked_token_indexes=None):
        prediction_scores = self.predictions(sequence_output, masked_token_indexes)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score
class BertPreTrainedModel(nn.Module):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    def __init__(self, config, *inputs, **kwargs):
        super(BertPreTrainedModel, self).__init__()
        if not isinstance(config, BertConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__))
        self.config = config

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            num_layers = self.config.num_hidden_layers
            std = self.config.initializer_range
            if hasattr(module, 'bert_output_layer'):
                # "Accounting for accumulation on the residual path"
                #print("Accounting for accumulation on the residual path")
                std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
            module.weight.data.normal_(mean=0.0, std=std)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        state_dict=None,
                        cache_dir=None,
                        from_tf=False,
                        *inputs,
                        **kwargs):
"""
Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            archive_file = pretrained_model_name_or_path
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error("Model name '{}' was not found in model name list ({}). "
                         "We assumed '{}' was a path or url but couldn't find any file "
                         "associated to this path or url.".format(
                             pretrained_model_name_or_path,
                             ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), archive_file))
            return None
        if resolved_archive_file == archive_file:
            logger.info("loading archive file {}".format(archive_file))
        else:
            logger.info("loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file))
        tempdir = None
        if os.path.isdir(resolved_archive_file) or from_tf:
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info("extracting archive file {} to temp dir {}".format(
                resolved_archive_file, tempdir))
            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
                archive.extractall(tempdir)
            serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None and not from_tf:
            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
            state_dict = torch.load(
                weights_path,
                map_location='cpu' if not torch.cuda.is_available() else None)
        if tempdir:
            # Clean up temp dir
            shutil.rmtree(tempdir)
        if from_tf:
            # Directly load from a TensorFlow checkpoint
            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
            return load_tf_weights_in_bert(model, weights_path)
        # Load from a PyTorch state_dict
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                         missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        start_prefix = ''
        if not hasattr(model, 'bert') and any(
                s.startswith('bert.') for s in state_dict.keys()):
            start_prefix = 'bert.'
        load(model, prefix=start_prefix)
        if len(missing_keys) > 0:
            logger.info("Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info("Weights from pretrained model not used in {}: {}".format(
                model.__class__.__name__, unexpected_keys))
        if len(error_msgs) > 0:
            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                model.__class__.__name__, "\n\t".join(error_msgs)))
        return model
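The gamma/beta renaming above maps TF-era LayerNorm parameter names onto PyTorch's `weight`/`bias` before the state dict is loaded. A compressed standalone sketch of the same idea (toy dictionary, not the exact loop used above):

```python
state_dict = {
    "bert.embeddings.LayerNorm.gamma": 1,
    "bert.embeddings.LayerNorm.beta": 2,
}
for key in list(state_dict.keys()):
    new_key = key.replace('gamma', 'weight').replace('beta', 'bias')
    if new_key != key:
        state_dict[new_key] = state_dict.pop(key)
print(sorted(state_dict))  # ['bert.embeddings.LayerNorm.bias', 'bert.embeddings.LayerNorm.weight']
```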
class BertModel(BertPreTrainedModel):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
config: a BertConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated to the first character of the
input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, args=None):
        super(BertModel, self).__init__(config)
        self.embeddings = BertEmbeddings(config)
        # set pad_token_id that is used for sparse attention padding
        self.pad_token_id = config.pad_token_id if hasattr(
            config, 'pad_token_id') and config.pad_token_id is not None else 0
        # set sparse_attention_config if it has been selected
        self.sparse_attention_config = get_sparse_attention_config(
            args, config.num_attention_heads)
        self.sparse_attention_utils = get_sparse_attention_utils(self.sparse_attention_config)
        self.encoder = BertEncoder(config,
                                   args,
                                   sparse_attention_config=self.sparse_attention_config)
        self.pooler = BertPooler(config)
        self.apply(self.init_bert_weights)
        logger.info("Init BERT pretrain model")

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                output_all_encoded_layers=True,
                checkpoint_activations=False,
                progressive_layer_drop=False,
                theta=0.5):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # If BertEncoder uses sparse attention, it needs to be padded based on the sparse attention block size
        if self.sparse_attention_config is not None:
            pad_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds = self.sparse_attention_utils.pad_to_block_size(
                block_size=self.sparse_attention_config.block,
                input_ids=input_ids,
                attention_mask=extended_attention_mask,
                token_type_ids=token_type_ids,
                position_ids=None,
                inputs_embeds=None,
                pad_token_id=self.pad_token_id,
                model_mbeddings=self.embeddings)

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoded_layers = self.encoder(embedding_output,
                                      extended_attention_mask,
                                      output_all_encoded_layers=output_all_encoded_layers,
                                      checkpoint_activations=checkpoint_activations,
                                      progressive_layer_drop=progressive_layer_drop,
                                      theta=theta)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)

        # If BertEncoder uses sparse attention, and input_ids were padded, sequence output needs to be unpadded to original length
        if self.sparse_attention_config is not None and pad_len > 0:
            encoded_layers[-1] = self.sparse_attention_utils.unpad_sequence_output(
                pad_len, encoded_layers[-1])

        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        return encoded_layers, pooled_output
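The 2D padding mask is turned into an additive bias of `-10000.0` on masked positions, which the softmax then drives to (almost) zero attention weight. A minimal standalone sketch:

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])                 # 1 = real token, 0 = padding
extended = attention_mask.unsqueeze(1).unsqueeze(2).float()   # [1, 1, 1, 4], broadcastable over heads
extended = (1.0 - extended) * -10000.0
scores = torch.zeros(1, 1, 4, 4) + extended                   # added to the raw attention scores
print(torch.softmax(scores, dim=-1)[0, 0, 0])                 # weight on the padded position is ~0
```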
class BertForPreTrainingPreLN(BertPreTrainedModel):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
- the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, args):
        super(BertForPreTrainingPreLN, self).__init__(config)
        self.bert = BertModel(config, args)
        self.cls = BertPreTrainingHeads(
            config, self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)
        self.args = args
    def forward(self, batch, **kwargs):
        progressive_layer_drop = kwargs.get('progressive_layer_drop', False)
        theta = kwargs.get('pld_theta', 1.0)

        input_ids = batch[1]
        token_type_ids = batch[3]
        attention_mask = batch[2]
        masked_lm_labels = batch[5]
        next_sentence_label = batch[4]
        checkpoint_activations = False

        sequence_output, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False,
            checkpoint_activations=checkpoint_activations,
            progressive_layer_drop=progressive_layer_drop,
            theta=theta)

        if masked_lm_labels is not None and next_sentence_label is not None:
            # filter out all masked labels.
            masked_token_indexes = torch.nonzero(
                (masked_lm_labels + 1).view(-1)).view(-1)
            prediction_scores, seq_relationship_score = self.cls(
                sequence_output, pooled_output, masked_token_indexes)
            target = torch.index_select(masked_lm_labels.view(-1), 0,
                                        masked_token_indexes)

            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size), target)
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
            return total_loss
        else:
            prediction_scores, seq_relationship_score = self.cls(
                sequence_output, pooled_output)
            return prediction_scores, seq_relationship_score
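# Illustrative worked example, not part of the original file: the
# "(masked_lm_labels + 1)" trick above keeps only the positions that actually
# carry an MLM label (everything set to -1 becomes 0 and is dropped by nonzero).
import torch

_labels = torch.tensor([[-1, 2003, -1, -1, 1037, -1]])
_masked_token_indexes = torch.nonzero((_labels + 1).view(-1)).view(-1)
_target = torch.index_select(_labels.view(-1), 0, _masked_token_indexes)
assert _masked_token_indexes.tolist() == [1, 4]
assert _target.tolist() == [2003, 1037]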
class BertForMaskedLM(BertPreTrainedModel):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForMaskedLM, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config,
                                   self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                masked_lm_labels=None,
                checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        prediction_scores = self.cls(sequence_output)

        if masked_lm_labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size),
                masked_lm_labels.view(-1))
            return masked_lm_loss
        else:
            return prediction_scores
class BertForNextSentencePrediction(BertPreTrainedModel):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `next_sentence_label` is not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForNextSentencePrediction, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)
        self.apply(self.init_bert_weights)
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                next_sentence_label=None,
                checkpoint_activations=False):
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids,
                                     attention_mask,
                                     output_all_encoded_layers=False)
        seq_relationship_score = self.cls(pooled_output)

        if next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            return next_sentence_loss
        else:
            return seq_relationship_score
class BertForSequenceClassification(BertPreTrainedModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, args, config, num_labels):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config, args)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids,
                                     attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits
class BertForMultipleChoice(BertPreTrainedModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_choices`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_choices = 2
model = BertForMultipleChoice(config, num_choices)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_choices):
        super(BertForMultipleChoice, self).__init__(config)
        self.num_choices = num_choices
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_bert_weights)
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        _, pooled_output = self.bert(flat_input_ids,
                                     flat_token_type_ids,
                                     flat_attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, self.num_choices)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
            return loss
        else:
            return reshaped_logits
class BertForTokenClassification(BertPreTrainedModel):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_labels):
        super(BertForTokenClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                labels=None,
                checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))
            return loss
        else:
            return logits
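# Illustrative worked example, not part of the original file: the active_loss
# mask above drops padding positions from the token-classification loss.
# Toy batch: the second sequence has two padded positions.
import torch

_num_labels = 3
_logits = torch.randn(2, 4, _num_labels)                    # [batch, seq_len, num_labels]
_labels = torch.tensor([[0, 1, 2, 1], [2, 0, 0, 0]])
_attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])

_active_loss = _attention_mask.view(-1) == 1                # 6 of 8 positions are real
_active_logits = _logits.view(-1, _num_labels)[_active_loss]
_active_labels = _labels.view(-1)[_active_loss]
assert _active_logits.shape == (6, _num_labels)
assert _active_labels.tolist() == [0, 1, 2, 1, 2, 0]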
class BertForQuestionAnswering(BertPreTrainedModel):
"""BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForQuestionAnswering, self).__init__(config)
        self.bert = BertModel(config)
        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.apply(self.init_bert_weights)
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                start_positions=None,
                end_positions=None,
                checkpoint_activations=False):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        else:
            return start_logits, end_logits
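# Illustrative sketch, not part of the original file: turning the start/end
# logits above into a predicted answer span via greedy argmax (real SQuAD
# decoding additionally enforces start <= end and keeps an n-best list).
import torch

_start_logits = torch.tensor([[0.1, 2.5, 0.3, 0.2]])
_end_logits = torch.tensor([[0.0, 0.4, 0.2, 3.1]])
_start_index = int(torch.argmax(_start_logits, dim=-1))
_end_index = int(torch.argmax(_end_logits, dim=-1))
assert (_start_index, _end_index) == (1, 3)   # answer spans tokens 1..3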
Deepspeed/BingBertGlue/nvidia_bert_dataset_provider.py
0 → 100644
import os
import random
import h5py
import logging
import json

from concurrent.futures import ProcessPoolExecutor

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import RandomSampler
from torch.utils.data.distributed import DistributedSampler

from bert_dataset_provider import BertDatasetProviderInterface
from turing.dataset import BatchType, map_to_torch
# Workaround because python functions are not picklable
class WorkerInitObj(object):
    def __init__(self, seed):
        self.seed = seed

    def __call__(self, id):
        np.random.seed(seed=self.seed + id)
        random.seed(self.seed + id)
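# Illustrative usage sketch, not part of the original file: WorkerInitObj is
# passed as worker_init_fn so every DataLoader worker gets a distinct,
# reproducible numpy/random seed (dataset below is a dummy placeholder).
import torch
from torch.utils.data import DataLoader, TensorDataset

_worker_init = WorkerInitObj(seed=42)
_dummy = TensorDataset(torch.arange(8))
_loader = DataLoader(_dummy, batch_size=2, num_workers=2, worker_init_fn=_worker_init)
for _batch in _loader:
    pass  # each worker process ran np.random.seed(42 + worker_id) before loading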
def create_pretraining_dataset(input_file, max_predictions_per_seq,
                               num_workers, train_batch_size, worker_init,
                               data_sampler):
    train_data = pretraining_dataset(
        input_file=input_file,
        max_predictions_per_seq=max_predictions_per_seq)
    train_dataloader = DataLoader(train_data,
                                  sampler=data_sampler(train_data),
                                  batch_size=train_batch_size,
                                  num_workers=num_workers,
                                  worker_init_fn=worker_init,
                                  pin_memory=True)
    return train_dataloader, len(train_data)
class pretraining_dataset(Dataset):
    def __init__(self, input_file, max_predictions_per_seq):
        self.input_file = input_file
        self.max_predictions_per_seq = max_predictions_per_seq
        f = h5py.File(input_file, "r")
        keys = [
            'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions',
            'masked_lm_ids', 'next_sentence_labels'
        ]
        self.inputs = [np.asarray(f[key][:]) for key in keys]
        f.close()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.inputs[0])

    def __getitem__(self, index):
        [
            input_ids, input_mask, segment_ids, masked_lm_positions,
            masked_lm_ids, next_sentence_labels
        ] = [
            torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else
            torch.from_numpy(np.asarray(input[index].astype(np.int64)))
            for indice, input in enumerate(self.inputs)
        ]

        masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
        index = self.max_predictions_per_seq
        # store number of masked tokens in index
        padded_mask_indices = (masked_lm_positions == 0).nonzero()
        if len(padded_mask_indices) != 0:
            index = padded_mask_indices[0].item()
        masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]

        return [
            map_to_torch([BatchType.PRETRAIN_BATCH]), input_ids, input_mask,
            segment_ids, next_sentence_labels, masked_lm_labels
        ]
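# Illustrative worked example, not part of the original file: how __getitem__
# densifies the zero-padded (position, id) lists into per-token MLM labels,
# assuming seq_len 8 and max_predictions_per_seq 4 with two real predictions.
import torch

_seq_len, _max_predictions_per_seq = 8, 4
_masked_lm_positions = torch.tensor([2, 5, 0, 0])      # zero-padded position list
_masked_lm_ids = torch.tensor([1037, 2003, 0, 0])      # zero-padded label ids

_masked_lm_labels = torch.ones(_seq_len, dtype=torch.long) * -1
_index = _max_predictions_per_seq
_padded_mask_indices = (_masked_lm_positions == 0).nonzero()
if len(_padded_mask_indices) != 0:
    _index = _padded_mask_indices[0].item()            # -> 2 real predictions
_masked_lm_labels[_masked_lm_positions[:_index]] = _masked_lm_ids[:_index]

assert _masked_lm_labels.tolist() == [-1, -1, 1037, -1, -1, 2003, -1, -1]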
class NvidiaBertDatasetProvider(BertDatasetProviderInterface):
    def __init__(self, args):
        self.num_workers = args.config['training']['num_workers']
        self.max_seq_length = args.max_seq_length
        self.max_predictions_per_seq = args.max_predictions_per_seq

        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        self.train_micro_batch_size_per_gpu = args.train_micro_batch_size_per_gpu
        self.logger = args.logger

        if args.local_rank == -1:
            self.global_rank = 0
            self.world_size = 1
        else:
            self.global_rank = dist.get_rank()
            self.world_size = dist.get_world_size()

        # Initialize dataset files
        dataset_path = os.path.join(
            args.data_path_prefix,
            args.config['data']['datasets']['pretrain_dataset'])
        self.dataset_files = [
            os.path.join(dataset_path, f) for f in os.listdir(dataset_path)
            if os.path.isfile(os.path.join(dataset_path, f)) and 'training' in f
        ]
        self.dataset_files.sort()
        random.shuffle(self.dataset_files)
        self.num_files = len(self.dataset_files)
        self.data_sampler = RandomSampler

        self.worker_init = WorkerInitObj(args.seed + args.local_rank)
        self.dataset_future = None
        self.pool = ProcessPoolExecutor(1)

        if self.global_rank == 0:
            self.logger.info(
                f"NvidiaBertDatasetProvider - Initialization: num_files = {self.num_files}"
            )
    def get_shard(self, index):
        if self.dataset_future is None:
            data_file = self._get_shard_file(index)
            self.train_dataloader, sample_count = create_pretraining_dataset(
                input_file=data_file,
                max_predictions_per_seq=self.max_predictions_per_seq,
                num_workers=self.num_workers,
                train_batch_size=self.train_micro_batch_size_per_gpu,
                worker_init=self.worker_init,
                data_sampler=self.data_sampler)
        else:
            self.train_dataloader, sample_count = self.dataset_future.result(
                timeout=None)

        return self.train_dataloader, sample_count
    def release_shard(self, index):
        del self.train_dataloader
    def prefetch_shard(self, index):
        data_file = self._get_shard_file(index)
        self.dataset_future = self.pool.submit(
            create_pretraining_dataset, data_file,
            self.max_predictions_per_seq, self.num_workers,
            self.train_micro_batch_size_per_gpu, self.worker_init,
            self.data_sampler)
    def get_batch(self, batch_iter):
        return batch_iter

    def prefetch_batch(self):
        pass
    def _get_shard_file(self, shard_index):
        file_index = self._get_shard_file_index(shard_index, self.global_rank)
        return self.dataset_files[file_index % self.num_files]

    def _get_shard_file_index(self, shard_index, global_rank):
        if dist.is_initialized() and self.world_size > self.num_files:
            remainder = self.world_size % self.num_files
            file_index = (shard_index * self.world_size) + global_rank + (
                remainder * shard_index)
        else:
            file_index = shard_index * self.world_size + global_rank

        return file_index % self.num_files
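# Illustrative sketch, not part of the original file: how a pretraining loop
# can drive this provider (the real driver lives in the BingBert training
# scripts; `provider` and `num_shards` are assumed here). The next shard is
# prefetched in the background pool while the current one is consumed.
def _run_epoch(provider, num_shards):
    for shard in range(num_shards):
        dataloader, sample_count = provider.get_shard(shard)
        if shard + 1 < num_shards:
            provider.prefetch_shard(shard + 1)       # loads asynchronously
        for batch in provider.get_batch(iter(dataloader)):
            pass                                     # forward/backward would go here
        provider.release_shard(shard)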
Deepspeed/BingBertGlue/pytorch_pretrained_bert/__init__.py
0 → 100644
__version__ = "0.4.0"
from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .modeling import (BertConfig, BertModel, BertForPreTraining,
                       BertForMaskedLM, BertForNextSentencePrediction,
                       BertForSequenceClassification, BertForMultipleChoice,
                       BertForTokenClassification, BertForQuestionAnswering)
from .optimization import BertAdam
from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE
Deepspeed/BingBertGlue/pytorch_pretrained_bert/__main__.py
0 → 100644
# coding: utf8
def main():
    import sys
    try:
        from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
    except ModuleNotFoundError:
        print(
            "pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
            "In that case, it requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    if len(sys.argv) != 5:
        # pylint: disable=line-too-long
        print(
            "Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`"
        )
    else:
        PYTORCH_DUMP_OUTPUT = sys.argv.pop()
        TF_CONFIG = sys.argv.pop()
        TF_CHECKPOINT = sys.argv.pop()
        convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG,
                                         PYTORCH_DUMP_OUTPUT)


if __name__ == '__main__':
    main()
Deepspeed/BingBertGlue/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
0 → 100644
# coding=utf-8
# Copyright 2018 The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert BERT checkpoint."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re
import argparse
import tensorflow as tf
import torch
import numpy as np

from .modeling import BertConfig, BertForPreTraining
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(
        tf_path, config_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path the TensorFlow checkpoint path.")
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model.\n"
        "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.bert_config_file,
                                     args.pytorch_dump_path)
Deepspeed/BingBertGlue/pytorch_pretrained_bert/file_utils.py
0 → 100644
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
import os
import logging
import shutil
import tempfile
import json
from urllib.parse import urlparse
from pathlib import Path
from typing import Optional, Tuple, Union, IO, Callable, Set
from hashlib import sha256
from functools import wraps

from tqdm import tqdm

import boto3
from botocore.exceptions import ClientError
import requests

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

PYTORCH_PRETRAINED_BERT_CACHE = Path(
    os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
              Path.home() / '.pytorch_pretrained_bert'))
def url_to_filename(url: str, etag: str = None) -> str:
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode('utf-8')
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()

    return filename
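# Illustrative example, not part of the original file: the cached filename is
# the sha256 of the URL, optionally joined by '.' to the sha256 of the ETag
# (URL and ETag below are made up).
_name = url_to_filename("https://example.com/model.tar.gz", etag='"abc123"')
_base, _, _etag_part = _name.partition('.')
assert len(_base) == 64 and len(_etag_part) == 64   # two sha256 hex digests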
def filename_to_url(filename: str,
                    cache_dir: Union[str, Path] = None) -> Tuple[str, str]:
    """
    Return the url and etag (which may be ``None``) stored for `filename`.
    Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        raise FileNotFoundError("file {} not found".format(cache_path))

    meta_path = cache_path + '.json'
    if not os.path.exists(meta_path):
        raise FileNotFoundError("file {} not found".format(meta_path))

    with open(meta_path) as meta_file:
        metadata = json.load(meta_file)
    url = metadata['url']
    etag = metadata['etag']

    return url, etag
def cached_path(url_or_filename: Union[str, Path],
                cache_dir: Union[str, Path] = None) -> str:
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    parsed = urlparse(url_or_filename)

    if parsed.scheme in ('http', 'https', 's3'):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
    elif parsed.scheme == '':
        # File, but it doesn't exist.
        raise FileNotFoundError("file {} not found".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError(
            "unable to parse {} as a URL or as a local path".format(
                url_or_filename))
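# Illustrative usage sketch, not part of the original file: an existing local
# path is returned unchanged, while a URL would be downloaded once into
# PYTORCH_PRETRAINED_BERT_CACHE and reused on later calls.
import os
import tempfile

with tempfile.NamedTemporaryFile(delete=False) as _f:
    _local = _f.name
assert cached_path(_local) == _local        # existing local file: returned as-is
os.remove(_local)
# cached_path("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz")
# would download the archive and return the cached path on subsequent calls.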
def split_s3_path(url: str) -> Tuple[str, str]:
    """Split a full s3 path into the bucket name and path."""
    parsed = urlparse(url)
    if not parsed.netloc or not parsed.path:
        raise ValueError("bad s3 path {}".format(url))
    bucket_name = parsed.netloc
    s3_path = parsed.path
    # Remove '/' at beginning of path.
    if s3_path.startswith("/"):
        s3_path = s3_path[1:]
    return bucket_name, s3_path
def s3_request(func: Callable):
    """
    Wrapper function for s3 requests in order to create more helpful error
    messages.
    """
    @wraps(func)
    def wrapper(url: str, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            if int(exc.response["Error"]["Code"]) == 404:
                raise FileNotFoundError("file {} not found".format(url))
            else:
                raise

    return wrapper
@s3_request
def s3_etag(url: str) -> Optional[str]:
    """Check ETag on S3 object."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag
@s3_request
def s3_get(url: str, temp_file: IO) -> None:
    """Pull a file directly from S3."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url: str, temp_file: IO) -> None:
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()
def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    os.makedirs(cache_dir, exist_ok=True)

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
        etag = s3_etag(url)
    else:
        response = requests.head(url, allow_redirects=True)
        if response.status_code != 200:
            raise IOError(
                "HEAD request failed for url {} with status code {}".format(
                    url, response.status_code))
        etag = response.headers.get("ETag")

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url,
                        temp_file.name)

            # GET file object
            if url.startswith("s3://"):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name,
                        cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w') as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path
def read_set_from_file(filename: str) -> Set[str]:
    '''
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    '''
    collection = set()
    with open(filename, 'r', encoding='utf-8') as file_:
        for line in file_:
            collection.add(line.rstrip())
    return collection
def get_file_extension(path: str, dot=True, lower: bool = True):
    ext = os.path.splitext(path)[1]
    ext = ext if dot else ext[1:]
    return ext.lower() if lower else ext
Deepspeed/BingBertGlue/pytorch_pretrained_bert/modeling.py
0 → 100644
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import copy
import json
import math
import logging
import tarfile
import tempfile
import shutil

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from .file_utils import cached_path

logger = logging.getLogger(__name__)

PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
    'bert-large-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
    'bert-base-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
    'bert-large-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
    'bert-base-multilingual-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
    'bert-base-multilingual-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
    'bert-base-chinese':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
CONFIG_NAME = 'bert_config.json'
WEIGHTS_NAME = 'pytorch_model.bin'
def gelu(x):
    """Implementation of the gelu activation function.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    """
    pdtype = x.dtype
    x = x.float()
    y = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return y.to(pdtype)
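# Illustrative check, not part of the original file: the erf-based gelu above
# closely matches the tanh approximation quoted in its docstring (the two
# differ by less than about 1e-3 over this range).
import math
import torch

_x = torch.linspace(-3, 3, steps=101)
_approx = 0.5 * _x * (1 + torch.tanh(
    math.sqrt(2 / math.pi) * (_x + 0.044715 * torch.pow(_x, 3))))
assert torch.allclose(gelu(_x), _approx, atol=1e-2)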
def swish(x):
    return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.
    """
    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02):
"""Constructs BertConfig.
Args:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: The dropout probabilitiy for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The sttdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
        if isinstance(vocab_size_or_config_json_file, str):
            with open(vocab_size_or_config_json_file, "r",
                      encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)")
    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        if torch.distributed.get_rank() == 0:
            print(config)
        return config
    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))
    def __repr__(self):
        return str(self.to_json_string())
    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output
    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


#try:
#    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
#except ImportError:
#    print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        pdtype = x.dtype
        x = x.float()
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x.to(pdtype) + self.bias
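# Illustrative check, not part of the original file: with freshly initialized
# scale/bias, this TF-style layer norm matches torch.nn.LayerNorm (epsilon
# inside the square root) up to float rounding.
import torch

_ln_tf = BertLayerNorm(hidden_size=16, eps=1e-12)
_ln_pt = torch.nn.LayerNorm(16, eps=1e-12)
_h = torch.randn(2, 5, 16)
assert torch.allclose(_ln_tf(_h), _ln_pt(_h), atol=1e-5)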
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length,
                                    dtype=torch.long,
                                    device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" %
                (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size /
                                       config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
                                       self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer,
                                        key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.attention_head_size)
        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask

        pdtype = attention_scores.dtype
        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores.float()).to(pdtype)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.all_head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer
class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dense.bert_output_layer = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
class BertAttention(nn.Module):
    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output
class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = ACT2FN[config.hidden_act] \
            if isinstance(config.hidden_act, str) else config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dense.bert_output_layer = True
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.PreAttentionLayerNorm = BertLayerNorm(config.hidden_size,
                                                   eps=1e-12)
        self.PostAttentionLayerNorm = BertLayerNorm(config.hidden_size,
                                                    eps=1e-12)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        input_layer_norm = self.PreAttentionLayerNorm(hidden_states)
        attention_output = self.attention(input_layer_norm, attention_mask)

        intermediate_input = hidden_states + attention_output
        intermediate_layer_norm = self.PostAttentionLayerNorm(
            intermediate_input)
        intermediate_output = self.intermediate(intermediate_layer_norm)
        layer_output = self.output(intermediate_output)

        return layer_output + intermediate_input
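# Illustrative shape check, not part of the original file: a single
# pre-LayerNorm transformer block as defined above normalizes before the
# attention/FFN sublayers and adds the residual afterwards (toy config sizes).
import torch

_cfg = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=64,
                  num_hidden_layers=2, num_attention_heads=4,
                  intermediate_size=256)
_layer = BertLayer(_cfg)
_hidden = torch.randn(2, 8, 64)
_mask = torch.zeros(2, 1, 1, 8)          # additive mask, 0 = attend everywhere
assert _layer(_hidden, _mask).shape == _hidden.shape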
class BertEncoder(nn.Module):
    def __init__(self, config):
        super(BertEncoder, self).__init__()

        #Added later to make it similar to GPT-2
        self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

        layer = BertLayer(config)
        self.layer = nn.ModuleList(
            [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

    def forward(self,
                hidden_states,
                attention_mask,
                output_all_encoded_layers=True):
        all_encoder_layers = []
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask)
            if output_all_encoded_layers:
                all_encoder_layers.append(hidden_states)
        if not output_all_encoded_layers:
            hidden_states = self.FinalLayerNorm(hidden_states)
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers
class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super(BertPredictionHeadTransform, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.transform_act_fn = ACT2FN[config.hidden_act] \
            if isinstance(config.hidden_act, str) else config.hidden_act
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states
class BertLMPredictionHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                 bert_model_embedding_weights.size(0),
                                 bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(
            torch.zeros(bert_model_embedding_weights.size(0)))

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states) + self.bias
        return hidden_states
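# Illustrative check, not part of the original file: the decoder weight is
# tied to the input embedding matrix (both names refer to the same Parameter),
# so the MLM logits reuse the embedding weights (toy sizes below).
import torch
from torch import nn

_emb = nn.Embedding(32000, 64)
_cfg = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=64,
                  num_hidden_layers=2, num_attention_heads=4,
                  intermediate_size=256)
_head = BertLMPredictionHead(_cfg, _emb.weight)
assert _head.decoder.weight is _emb.weight
assert _head(torch.randn(2, 5, 64)).shape == (2, 5, 32000)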
class BertOnlyMLMHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertOnlyMLMHead, self).__init__()
        self.predictions = BertLMPredictionHead(config,
                                                bert_model_embedding_weights)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores
class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super(BertOnlyNSPHead, self).__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score
class BertPreTrainingHeads(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertPreTrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(config,
                                                bert_model_embedding_weights)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


class PreTrainedBertModel(nn.Module):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    def __init__(self, config, *inputs, **kwargs):
        super(PreTrainedBertModel, self).__init__()
        if not isinstance(config, BertConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__))
        self.config = config
    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        logger.info("Init BERT weights")
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            num_layers = self.config.num_hidden_layers
            std = self.config.initializer_range
            if hasattr(module, 'bert_output_layer'):
                if torch.distributed.get_rank() == 0:
                    print("Accounting for accumulation on the residual path")
                std = self.config.initializer_range / math.sqrt(2.0 * num_layers)
            module.weight.data.normal_(mean=0.0, std=std)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    @classmethod
    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None,
                        *inputs, **kwargs):
        """
Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
Download and cache the pre-trained model file if needed.
Params:
pretrained_model_name: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
        else:
            archive_file = pretrained_model_name
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
        except FileNotFoundError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name,
                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
                    archive_file))
            return None
        if resolved_archive_file == archive_file:
            logger.info("loading archive file {}".format(archive_file))
        else:
            logger.info("loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file))
        tempdir = None
        if os.path.isdir(resolved_archive_file):
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info("extracting archive file {} to temp dir {}".format(
                resolved_archive_file, tempdir))
            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
                archive.extractall(tempdir)
            serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None:
            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
            state_dict = torch.load(weights_path)

        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                         missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
        if len(missing_keys) > 0:
            logger.info("Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info("Weights from pretrained model not used in {}: {}".format(
                model.__class__.__name__, unexpected_keys))
        if tempdir:
            # Clean up temp dir
            shutil.rmtree(tempdir)
        return model


class BertModel(PreTrainedBertModel):
    """BERT model ("Bidirectional Embedding Representations from a Transformer").
Params:
config: a BertConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
Outputs: Tuple of (encoded_layers, pooled_output)
`encoded_layers`: controlled by `output_all_encoded_layers` argument:
- `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
- `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
to the last attention block of shape [batch_size, sequence_length, hidden_size],
`pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
classifier pretrained on top of the hidden state associated to the first character of the
input (`CLF`) to train on the Next-Sentence task (see BERT's paper).
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertModel, self).__init__(config)
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.apply(self.init_bert_weights)
        logger.info("Init BERT pretrain model")

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                output_all_encoded_layers=True):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoded_layers = self.encoder(embedding_output,
                                      extended_attention_mask,
                                      output_all_encoded_layers=output_all_encoded_layers)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)
        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        return encoded_layers, pooled_output


class BertForPreTraining(PreTrainedBertModel):
    """BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
- the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForPreTraining, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config,
                                        self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)

    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                masked_lm_labels=None,
                next_sentence_label=None):
        sequence_output, pooled_output = self.bert(input_ids,
                                                   token_type_ids,
                                                   attention_mask,
                                                   output_all_encoded_layers=False)
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        if masked_lm_labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size),
                                      masked_lm_labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
            return total_loss
        else:
            return prediction_scores, seq_relationship_score


class BertForMaskedLM(PreTrainedBertModel):
    """BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForMaskedLM, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                masked_lm_labels=None):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        prediction_scores = self.cls(sequence_output)

        if masked_lm_labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size),
                                      masked_lm_labels.view(-1))
            return masked_lm_loss
        else:
            return prediction_scores


class BertForNextSentencePrediction(PreTrainedBertModel):
    """BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
Params:
config: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
Outputs:
if `next_sentence_label` is not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForNextSentencePrediction, self).__init__(config)
        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                next_sentence_label=None):
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids,
                                     attention_mask,
                                     output_all_encoded_layers=False)
        seq_relationship_score = self.cls(pooled_output)

        if next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2),
                                          next_sentence_label.view(-1))
            return next_sentence_loss
        else:
            return seq_relationship_score


class BertForSequenceClassification(PreTrainedBertModel):
    """BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_labels=2):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids,
                                     token_type_ids,
                                     attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits


class BertForMultipleChoice(PreTrainedBertModel):
    """BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_choices`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_choices = 2
model = BertForMultipleChoice(config, num_choices)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_choices=2):
        super(BertForMultipleChoice, self).__init__(config)
        self.num_choices = num_choices
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        _, pooled_output = self.bert(flat_input_ids,
                                     flat_token_type_ids,
                                     flat_attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, self.num_choices)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)
            return loss
        else:
            return reshaped_logits


class BertForTokenClassification(PreTrainedBertModel):
    """BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config, num_labels=2):
        super(BertForTokenClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits


class BertForQuestionAnswering(PreTrainedBertModel):
    """BERT model for Question Answering (span extraction).
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
Positions are clamped to the length of the sequence and position outside of the sequence are not taken
into account for computing the loss.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
    def __init__(self, config):
        super(BertForQuestionAnswering, self).__init__(config)
        self.bert = BertModel(config)
        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None,
                start_positions=None, end_positions=None):
        sequence_output, _ = self.bert(input_ids,
                                       token_type_ids,
                                       attention_mask,
                                       output_all_encoded_layers=False)
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        else:
            return start_logits, end_logits
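The task heads above all follow the same pattern: `BertModel` yields `sequence_output`/`pooled_output`, a small head maps them to logits, and passing labels switches the forward pass to returning a loss. A minimal fine-tuning sketch, assuming the `bert-base-uncased` archive is downloadable or already cached, and reusing the toy tensors from the docstring examples (real ids come from the tokenizer defined further down):

```python
import torch

# Toy inputs mirroring the docstring examples above; illustrative only.
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
labels = torch.LongTensor([1, 0])

# from_pretrained (defined on PreTrainedBertModel above) resolves and loads the
# archive; num_labels is forwarded to BertForSequenceClassification.__init__.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.eval()
with torch.no_grad():
    logits = model(input_ids, token_type_ids, input_mask)     # [batch_size, num_labels]
loss = model(input_ids, token_type_ids, input_mask, labels)   # scalar loss when labels are given
```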
Deepspeed/BingBertGlue/pytorch_pretrained_bert/optimization.py 0 → 100644 View file @ 316d3f90
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_


def warmup_cosine(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 0.5 * (1.0 + torch.cos(math.pi * x))


def warmup_constant(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0


def warmup_linear(x, warmup=0.002):
    if warmup == 0.0:
        return 1.0
    elif x < warmup:
        return x / warmup
    return 1.0 - x


def warmup_linear_decay_exp(global_step, decay_rate, decay_steps, total_steps,
                            warmup=0.002):
    x = global_step / total_steps
    warmup_end = warmup * total_steps
    if warmup == 0.0:
        return 1.0
    elif x < warmup:
        return x / warmup
    return decay_rate**((global_step - warmup_end) / decay_steps)


def warmup_exp_decay_exp(global_step, decay_rate, decay_steps, total_steps,
                         warmup=0.002, degree=2.0):
    x = global_step / total_steps
    warmup_end = warmup * total_steps
    if warmup == 0.0:
        return 1.0
    elif x < warmup:
        return (x / warmup)**degree
    return decay_rate**((global_step - warmup_end) / decay_steps)


def warmup_exp_decay_poly(global_step, total_steps, warmup=0.002,
                          warm_degree=1.5, degree=2.0):
    x = global_step / total_steps
    if x < warmup:
        return (x / warmup)**warm_degree
    return (1.0 - x)**degree


SCHEDULES = {
    'warmup_cosine': warmup_cosine,
    'warmup_constant': warmup_constant,
    'warmup_linear': warmup_linear,
    'warmup_linear_decay_exp': warmup_linear_decay_exp,
    'warmup_exp_decay_poly': warmup_exp_decay_poly,
    'warmup_exp_decay_exp': warmup_exp_decay_exp,
}


class BertAdam(Optimizer):
    """Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
b1: Adams b1. Default: 0.9
b2: Adams b2. Default: 0.999
e: Adams epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
    def __init__(self, params, lr=required, warmup=-1, t_total=-1,
                 schedule='warmup_linear', b1=0.9, b2=0.999, e=1e-6,
                 weight_decay=0.01, max_grad_norm=1.0):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0.0 <= warmup < 1.0 and not warmup == -1:
            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
        if not e >= 0.0:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
        defaults = dict(lr=lr,
                        schedule=schedule,
                        warmup=warmup,
                        t_total=t_total,
                        b1=b1,
                        b2=b2,
                        e=e,
                        weight_decay=weight_decay,
                        max_grad_norm=max_grad_norm)
        super(BertAdam, self).__init__(params, defaults)

    def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(
                        state['step'] / group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']
                lr.append(lr_scheduled)
        return lr

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'Adam does not support sparse gradients, please consider SparseAdam instead'
                    )

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['next_m'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['next_v'] = torch.zeros_like(p.data)

                next_m, next_v = state['next_m'], state['next_v']
                beta1, beta2 = group['b1'], group['b2']

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                next_m.mul_(beta1).add_(1 - beta1, grad)
                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                update = next_m / (next_v.sqrt() + group['e'])

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if group['weight_decay'] > 0.0:
                    update += group['weight_decay'] * p.data

                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(
                        state['step'] / group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']

                update_with_lr = lr_scheduled * update
                p.data.add_(-update_with_lr)

                state['step'] += 1

                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
                # No bias correction
                # bias_correction1 = 1 - beta1 ** state['step']
                # bias_correction2 = 1 - beta2 ** state['step']

        return loss
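`BertAdam` folds the learning-rate schedule into `step()`: every update rescales `lr` by `SCHEDULES[schedule](step / t_total, warmup)` and applies decoupled weight decay instead of adding an L2 term to the loss. A minimal setup sketch; the hyperparameter values and the parameter grouping are illustrative and not taken from the GLUE runners in this commit:

```python
# Illustrative grouping: no weight decay on biases and LayerNorm parameters.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=0.1,        # linear ramp over the first 10% of steps
                     t_total=1000,      # total number of optimizer steps (illustrative)
                     schedule='warmup_linear')

# Training-loop skeleton:
#   loss.backward()
#   optimizer.step()       # applies warmup_linear(step / t_total, warmup) internally
#   optimizer.zero_grad()
```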
Deepspeed/BingBertGlue/pytorch_pretrained_bert/tokenization.py 0 → 100644 View file @ 316d3f90
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import unicodedata
import os
import logging

from .file_utils import cached_path

logger = logging.getLogger(__name__)

PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'bert-base-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
    'bert-large-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    'bert-base-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
    'bert-large-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
    'bert-base-multilingual-uncased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
    'bert-base-multilingual-cased':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
    'bert-base-chinese':
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'bert-base-uncased': 512,
    'bert-large-uncased': 512,
    'bert-base-cased': 512,
    'bert-large-cased': 512,
    'bert-base-multilingual-uncased': 512,
    'bert-base-multilingual-cased': 512,
    'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 max_len=None,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                              never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            raise ValueError(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    @classmethod
    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
        """
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
        else:
            vocab_file = pretrained_model_name
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except FileNotFoundError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
    def __init__(self,
                 do_lower_case=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BasicTokenizer.
        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = self._clean_text(text)
        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case and token not in self.never_split:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        if text in self.never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
                (cp >= 0x3400 and cp <= 0x4DBF) or  #
                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or  #
                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.
        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.
        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]
        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.
        Returns:
            A list of wordpiece tokens.
        """
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
            or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
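A short sketch of the tokenization round trip defined above, assuming the `bert-base-uncased` vocabulary can be fetched or is already cached; the sentence is arbitrary and the WordPiece split shown is the one from the `WordpieceTokenizer` docstring:

```python
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokens = tokenizer.tokenize("He remained unaffable.")
# BasicTokenizer lower-cases and splits punctuation; WordpieceTokenizer then yields
# pieces such as ["un", "##aff", "##able"] (the exact split depends on the vocab).
ids = tokenizer.convert_tokens_to_ids(tokens)
assert tokenizer.convert_ids_to_tokens(ids) == tokens
```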
Deepspeed/BingBertGlue/run_glue_bert_base_finetune.sh 0 → 100644 View file @ 316d3f90
LOG_DIR="log"
if [ ! -d "$LOG_DIR" ]; then
    mkdir $LOG_DIR
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null 2>&1 && pwd )"

NGPU=8
echo "Started scripts"

TASK=RTE
EFFECTIVE_BATCH_SIZE=16
LR=2e-5
NUM_EPOCH=3
base_dir=`pwd`
model_name="bert_base"
JOBNAME=test
CHECKPOINT_PATH=$1
OUTPUT_DIR="${SCRIPT_DIR}/outputs/${model_name}/${JOBNAME}_bsz${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}"

GLUE_DIR="/data/GlueData"
MAX_GPU_BATCH_SIZE=1
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi

echo "Fine Tuning $CHECKPOINT_PATH"
run_cmd="deepspeed \
    --num_nodes 2 \
    --num_gpus ${NGPU} \
    run_glue_classifier_bert_base.py \
    --task_name $TASK \
    --do_train \
    --do_eval \
    --deepspeed \
    --preln \
    --deepspeed_config ${base_dir}/glue_bert_base.json \
    --do_lower_case \
    --data_dir $GLUE_DIR/$TASK/ \
    --bert_model bert-base-uncased \
    --max_seq_length 128 \
    --train_batch_size ${PER_GPU_BATCH_SIZE} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --learning_rate ${LR} \
    --num_train_epochs ${NUM_EPOCH} \
    --output_dir ${OUTPUT_DIR}_${TASK} \
    --model_file ${CHECKPOINT_PATH}"
echo ${run_cmd}
eval ${run_cmd}
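With the values above, PER_GPU_BATCH_SIZE = 16/8 = 2; since that is not below MAX_GPU_BATCH_SIZE=1, the script sets GRAD_ACCUM_STEPS = 2/1 = 2, i.e. each GPU accumulates two micro-batches of one example per optimizer step. The checkpoint to fine-tune is passed as the script's first argument, e.g. `bash run_glue_bert_base_finetune.sh /path/to/checkpoint` (path illustrative).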
Deepspeed/BingBertGlue/run_glue_bert_large_finetune.sh 0 → 100644 View file @ 316d3f90
LOG_DIR="log"
if [ ! -d "$LOG_DIR" ]; then
    mkdir $LOG_DIR
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null 2>&1 && pwd )"

NGPU=8
echo "Started scripts"

TASK=RTE
EFFECTIVE_BATCH_SIZE=16
LR=2e-5
NUM_EPOCH=3
base_dir=`pwd`
model_name="bert_large"
JOBNAME=test
CHECKPOINT_PATH=$1
OUTPUT_DIR="${SCRIPT_DIR}/outputs/${model_name}/${JOBNAME}_bsz${EFFECTIVE_BATCH_SIZE}_lr${LR}_epoch${NUM_EPOCH}"

GLUE_DIR="/data/GlueData"
MAX_GPU_BATCH_SIZE=32
PER_GPU_BATCH_SIZE=$((EFFECTIVE_BATCH_SIZE/NGPU))
if [[ $PER_GPU_BATCH_SIZE -lt $MAX_GPU_BATCH_SIZE ]]; then
    GRAD_ACCUM_STEPS=1
else
    GRAD_ACCUM_STEPS=$((PER_GPU_BATCH_SIZE/MAX_GPU_BATCH_SIZE))
fi

echo "Fine Tuning $CHECKPOINT_PATH"
run_cmd="deepspeed \
    --num_nodes 2 \
    --num_gpus ${NGPU} \
    run_glue_classifier_bert_large.py \
    --task_name $TASK \
    --do_train \
    --do_eval \
    --deepspeed \
    --preln \
    --deepspeed_config ${base_dir}/glue_bert_large.json \
    --do_lower_case \
    --data_dir $GLUE_DIR/$TASK/ \
    --bert_model bert-large-uncased \
    --max_seq_length 128 \
    --train_batch_size ${PER_GPU_BATCH_SIZE} \
    --gradient_accumulation_steps ${GRAD_ACCUM_STEPS} \
    --learning_rate ${LR} \
    --num_train_epochs ${NUM_EPOCH} \
    --output_dir ${OUTPUT_DIR}_${TASK} \
    --model_file $CHECKPOINT_PATH"
echo ${run_cmd}
eval ${run_cmd}
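Same launcher as the base script, but with MAX_GPU_BATCH_SIZE=32 the per-GPU batch of 16/8 = 2 fits in a single micro-batch, so GRAD_ACCUM_STEPS stays at 1.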
Deepspeed/BingBertGlue/run_glue_classifier_bert_base.py 0 → 100644 View file @ 316d3f90
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function

import argparse
import csv
import logging
import os
import random
import sys
import deepspeed

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss, MSELoss
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from turing.loss import FocalLoss

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for simple sequence classification."""
def
__init__
(
self
,
guid
,
text_a
,
text_b
=
None
,
label
=
None
):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self
.
guid
=
guid
self
.
text_a
=
text_a
self
.
text_b
=
text_b
self
.
label
=
label
class
InputFeatures
(
object
):
"""A single set of features of data."""
def
__init__
(
self
,
input_ids
,
input_mask
,
segment_ids
,
label_id
):
self
.
input_ids
=
input_ids
self
.
input_mask
=
input_mask
self
.
segment_ids
=
segment_ids
self
.
label_id
=
label_id
class
DataProcessor
(
object
):
"""Base class for data converters for sequence classification data sets."""
def
get_train_examples
(
self
,
data_dir
):
"""Gets a collection of `InputExample`s for the train set."""
raise
NotImplementedError
()
def
get_dev_examples
(
self
,
data_dir
):
"""Gets a collection of `InputExample`s for the dev set."""
raise
NotImplementedError
()
def
get_labels
(
self
):
"""Gets the list of labels for this data set."""
raise
NotImplementedError
()
@
classmethod
def
_read_tsv
(
cls
,
input_file
,
quotechar
=
None
):
"""Reads a tab separated value file."""
with
open
(
input_file
,
"r"
,
encoding
=
'utf-8'
)
as
f
:
reader
=
csv
.
reader
(
f
,
delimiter
=
"
\t
"
,
quotechar
=
quotechar
)
lines
=
[]
for
line
in
reader
:
if
sys
.
version_info
[
0
]
==
2
:
line
=
list
(
unicode
(
cell
,
'utf-8'
)
for
cell
in
line
)
lines
.
append
(
line
)
return
lines
class
MrpcProcessor
(
DataProcessor
):
"""Processor for the MRPC data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
logger
.
info
(
"LOOKING AT {}"
.
format
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)))
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
line
[
3
]
text_b
=
line
[
4
]
label
=
line
[
0
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
MnliProcessor
(
DataProcessor
):
"""Processor for the MultiNLI data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev_matched.tsv"
)),
"dev_matched"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"contradiction"
,
"entailment"
,
"neutral"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
8
]
text_b
=
line
[
9
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
MnliMismatchedProcessor
(
MnliProcessor
):
"""Processor for the MultiNLI Mismatched data set (GLUE version)."""
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev_mismatched.tsv"
)),
"dev_matched"
)
class
ColaProcessor
(
DataProcessor
):
"""Processor for the CoLA data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
line
[
3
]
label
=
line
[
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
None
,
label
=
label
))
return
examples
class
Sst2Processor
(
DataProcessor
):
"""Processor for the SST-2 data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
line
[
0
]
label
=
line
[
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
None
,
label
=
label
))
return
examples
class
StsbProcessor
(
DataProcessor
):
"""Processor for the STS-B data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
None
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
7
]
text_b
=
line
[
8
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
QqpProcessor
(
DataProcessor
):
"""Processor for the STS-B data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
try
:
text_a
=
line
[
3
]
text_b
=
line
[
4
]
label
=
line
[
5
]
except
IndexError
:
continue
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
QnliProcessor
(
DataProcessor
):
"""Processor for the STS-B data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev_matched"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"entailment"
,
"not_entailment"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
1
]
text_b
=
line
[
2
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
RteProcessor
(
DataProcessor
):
"""Processor for the RTE data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"entailment"
,
"not_entailment"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
1
]
text_b
=
line
[
2
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
WnliProcessor
(
DataProcessor
):
"""Processor for the WNLI data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
1
]
text_b
=
line
[
2
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
def
convert_examples_to_features
(
examples
,
label_list
,
max_seq_length
,
tokenizer
,
output_mode
):
"""Loads a data file into a list of `InputBatch`s."""
label_map
=
{
label
:
i
for
i
,
label
in
enumerate
(
label_list
)}
features
=
[]
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
if
ex_index
%
10000
==
0
:
logger
.
info
(
"Writing example %d of %d"
%
(
ex_index
,
len
(
examples
)))
tokens_a
=
tokenizer
.
tokenize
(
example
.
text_a
)
tokens_b
=
None
if
example
.
text_b
:
tokens_b
=
tokenizer
.
tokenize
(
example
.
text_b
)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair
(
tokens_a
,
tokens_b
,
max_seq_length
-
3
)
else
:
# Account for [CLS] and [SEP] with "- 2"
if
len
(
tokens_a
)
>
max_seq_length
-
2
:
tokens_a
=
tokens_a
[:(
max_seq_length
-
2
)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
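        # Illustrative example (added for clarity, with max_seq_length = 10):
        #   tokens_a = ["he", "is", "tall"], tokens_b = ["yes"]
        #   tokens      = [CLS] he is tall [SEP] yes [SEP]
        #   segment_ids =   0   0  0   0    0    1    1
        #   input_mask  =   1   1  1   1    1    1    1   (then 0 0 0 after padding)
        #   input_ids, input_mask and segment_ids are all zero-padded to length 10 below.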
tokens
=
[
"[CLS]"
]
+
tokens_a
+
[
"[SEP]"
]
segment_ids
=
[
0
]
*
len
(
tokens
)
if
tokens_b
:
tokens
+=
tokens_b
+
[
"[SEP]"
]
segment_ids
+=
[
1
]
*
(
len
(
tokens_b
)
+
1
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask
=
[
1
]
*
len
(
input_ids
)
# Zero-pad up to the sequence length.
padding
=
[
0
]
*
(
max_seq_length
-
len
(
input_ids
))
input_ids
+=
padding
input_mask
+=
padding
segment_ids
+=
padding
assert
len
(
input_ids
)
==
max_seq_length
assert
len
(
input_mask
)
==
max_seq_length
assert
len
(
segment_ids
)
==
max_seq_length
if
output_mode
==
"classification"
:
label_id
=
label_map
[
example
.
label
]
elif
output_mode
==
"regression"
:
label_id
=
float
(
example
.
label
)
else
:
raise
KeyError
(
output_mode
)
if
ex_index
<
5
:
logger
.
info
(
"*** Example ***"
)
logger
.
info
(
"guid: %s"
%
(
example
.
guid
))
logger
.
info
(
"tokens: %s"
%
" "
.
join
(
[
str
(
x
)
for
x
in
tokens
]))
logger
.
info
(
"input_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_ids
]))
logger
.
info
(
"input_mask: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_mask
]))
logger
.
info
(
"segment_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
segment_ids
]))
logger
.
info
(
"label: %s (id = %d)"
%
(
example
.
label
,
label_id
))
features
.
append
(
InputFeatures
(
input_ids
=
input_ids
,
input_mask
=
input_mask
,
segment_ids
=
segment_ids
,
label_id
=
label_id
))
return
features
def
_truncate_seq_pair
(
tokens_a
,
tokens_b
,
max_length
):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while
True
:
total_length
=
len
(
tokens_a
)
+
len
(
tokens_b
)
if
total_length
<=
max_length
:
break
if
len
(
tokens_a
)
>
len
(
tokens_b
):
tokens_a
.
pop
()
else
:
tokens_b
.
pop
()
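# Example of the truncation heuristic above (added for illustration):
#   tokens_a has 6 tokens, tokens_b has 3 tokens, max_length = 7
#   -> tokens_a is popped twice (6+3=9 > 7, then 5+3=8 > 7), leaving 4 + 3 = 7 tokens.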
def
simple_accuracy
(
preds
,
labels
):
return
(
preds
==
labels
).
mean
()
def
acc_and_f1
(
preds
,
labels
):
acc
=
simple_accuracy
(
preds
,
labels
)
f1
=
f1_score
(
y_true
=
labels
,
y_pred
=
preds
)
return
{
"acc"
:
acc
,
"f1"
:
f1
,
"acc_and_f1"
:
(
acc
+
f1
)
/
2
,
}
def
pearson_and_spearman
(
preds
,
labels
):
pearson_corr
=
pearsonr
(
preds
,
labels
)[
0
]
spearman_corr
=
spearmanr
(
preds
,
labels
)[
0
]
return
{
"pearson"
:
pearson_corr
,
"spearmanr"
:
spearman_corr
,
"corr"
:
(
pearson_corr
+
spearman_corr
)
/
2
,
}
def
compute_metrics
(
task_name
,
preds
,
labels
):
assert
len
(
preds
)
==
len
(
labels
)
if
task_name
==
"cola"
:
return
{
"mcc"
:
matthews_corrcoef
(
labels
,
preds
)}
elif
task_name
==
"sst-2"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"mrpc"
:
return
acc_and_f1
(
preds
,
labels
)
elif
task_name
==
"sts-b"
:
return
pearson_and_spearman
(
preds
,
labels
)
elif
task_name
==
"qqp"
:
return
acc_and_f1
(
preds
,
labels
)
elif
task_name
==
"mnli"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"mnli-mm"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"qnli"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"rte"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"wnli"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
else
:
raise
KeyError
(
task_name
)
def
main
():
parser
=
argparse
.
ArgumentParser
()
# Required parameters
parser
.
add_argument
(
"--data_dir"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The input data dir. Should contain the .tsv files (or other data files) for the task."
)
parser
.
add_argument
(
"--bert_model"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese."
)
parser
.
add_argument
(
"--task_name"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The name of the task to train."
)
parser
.
add_argument
(
"--output_dir"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The output directory where the model predictions and checkpoints will be written."
)
# Other parameters
parser
.
add_argument
(
"--cache_dir"
,
default
=
""
,
type
=
str
,
help
=
"Where do you want to store the pre-trained models downloaded from s3"
)
parser
.
add_argument
(
"--max_seq_length"
,
default
=
128
,
type
=
int
,
help
=
"The maximum total input sequence length after WordPiece tokenization.
\n
"
"Sequences longer than this will be truncated, and sequences shorter
\n
"
"than this will be padded."
)
parser
.
add_argument
(
"--do_train"
,
action
=
'store_true'
,
help
=
"Whether to run training."
)
parser
.
add_argument
(
"--do_eval"
,
action
=
'store_true'
,
help
=
"Whether to run eval on the dev set."
)
parser
.
add_argument
(
"--do_lower_case"
,
action
=
'store_true'
,
help
=
"Set this flag if you are using an uncased model."
)
parser
.
add_argument
(
"--train_batch_size"
,
default
=
32
,
type
=
int
,
help
=
"Total batch size for training."
)
parser
.
add_argument
(
"--eval_batch_size"
,
default
=
8
,
type
=
int
,
help
=
"Total batch size for eval."
)
parser
.
add_argument
(
"--learning_rate"
,
default
=
5e-5
,
type
=
float
,
help
=
"The initial learning rate for Adam."
)
parser
.
add_argument
(
"--num_train_epochs"
,
default
=
3.0
,
type
=
float
,
help
=
"Total number of training epochs to perform."
)
parser
.
add_argument
(
"--warmup_proportion"
,
default
=
0.1
,
type
=
float
,
help
=
"Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training."
)
parser
.
add_argument
(
"--no_cuda"
,
action
=
'store_true'
,
help
=
"Whether not to use CUDA when available"
)
parser
.
add_argument
(
"--local_rank"
,
type
=
int
,
default
=-
1
,
help
=
"local_rank for distributed training on gpus"
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
42
,
help
=
"random seed for initialization"
)
parser
.
add_argument
(
'--gradient_accumulation_steps'
,
type
=
int
,
default
=
1
,
help
=
"Number of updates steps to accumulate before performing a backward/update pass."
)
parser
.
add_argument
(
'--fp16'
,
action
=
'store_true'
,
help
=
"Whether to use 16-bit float precision instead of 32-bit"
)
parser
.
add_argument
(
'--loss_scale'
,
type
=
float
,
default
=
0
,
help
=
"Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.
\n
"
"0 (default value): dynamic loss scaling.
\n
"
"Positive power of 2: static loss scaling value.
\n
"
)
parser
.
add_argument
(
'--server_ip'
,
type
=
str
,
default
=
''
,
help
=
"Can be used for distant debugging."
)
parser
.
add_argument
(
'--server_port'
,
type
=
str
,
default
=
''
,
help
=
"Can be used for distant debugging."
)
parser
.
add_argument
(
"--model_file"
,
type
=
str
,
default
=
"0"
,
help
=
"Path to the Pretrained BERT Encoder File."
)
parser
.
add_argument
(
'--random'
,
default
=
False
,
action
=
'store_true'
,
help
=
"Whether to fientune for random initialization"
)
parser
.
add_argument
(
'--focal'
,
default
=
False
,
action
=
'store_true'
,
help
=
"Whether to use Focal Loss for finetuning."
)
parser
.
add_argument
(
'--gamma'
,
type
=
float
,
default
=
0.5
,
help
=
"Gamma parameter to be used in focal loss."
)
parser
.
add_argument
(
'--deepspeed_sparse_attention'
,
default
=
False
,
action
=
'store_true'
,
help
=
'Use DeepSpeed sparse self attention.'
)
parser
.
add_argument
(
'--deepspeed_transformer_kernel'
,
default
=
False
,
action
=
'store_true'
,
help
=
'Use DeepSpeed transformer kernel to accelerate.'
)
parser
.
add_argument
(
'--progressive_layer_drop'
,
default
=
False
,
action
=
'store_true'
,
help
=
"Whether to enable progressive layer dropping or not"
)
parser
.
add_argument
(
'--preln'
,
action
=
'store_true'
,
default
=
False
,
help
=
"Switching to the variant of Transformer blocks that use pre-LayerNorm."
)
parser
=
deepspeed
.
add_config_arguments
(
parser
)
args
=
parser
.
parse_args
()
if
args
.
server_ip
and
args
.
server_port
:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import
ptvsd
print
(
"Waiting for debugger attach"
)
ptvsd
.
enable_attach
(
address
=
(
args
.
server_ip
,
args
.
server_port
),
redirect_output
=
True
)
ptvsd
.
wait_for_attach
()
processors
=
{
"cola"
:
ColaProcessor
,
"mnli"
:
MnliProcessor
,
"mnli-mm"
:
MnliMismatchedProcessor
,
"mrpc"
:
MrpcProcessor
,
"sst-2"
:
Sst2Processor
,
"sts-b"
:
StsbProcessor
,
"qqp"
:
QqpProcessor
,
"qnli"
:
QnliProcessor
,
"rte"
:
RteProcessor
,
"wnli"
:
WnliProcessor
,
}
output_modes
=
{
"cola"
:
"classification"
,
"mnli"
:
"classification"
,
"mrpc"
:
"classification"
,
"sst-2"
:
"classification"
,
"sts-b"
:
"regression"
,
"qqp"
:
"classification"
,
"qnli"
:
"classification"
,
"rte"
:
"classification"
,
"wnli"
:
"classification"
,
}
if
args
.
local_rank
==
-
1
or
args
.
no_cuda
:
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
and
not
args
.
no_cuda
else
"cpu"
)
n_gpu
=
torch
.
cuda
.
device_count
()
else
:
torch
.
cuda
.
set_device
(
args
.
local_rank
)
device
=
torch
.
device
(
"cuda"
,
args
.
local_rank
)
n_gpu
=
1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch
.
distributed
.
init_process_group
(
backend
=
'nccl'
)
logger
.
info
(
"device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}"
.
format
(
device
,
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
),
args
.
fp16
))
if
args
.
gradient_accumulation_steps
<
1
:
raise
ValueError
(
"Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
.
format
(
args
.
gradient_accumulation_steps
))
args
.
train_batch_size
=
args
.
train_batch_size
//
args
.
gradient_accumulation_steps
args
.
seed
=
random
.
randint
(
1
,
1000
)
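    # Note: this replaces the --seed command-line value with a fresh random seed on every run.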
random
.
seed
(
args
.
seed
)
np
.
random
.
seed
(
args
.
seed
)
torch
.
manual_seed
(
args
.
seed
)
if
n_gpu
>
0
:
torch
.
cuda
.
manual_seed_all
(
args
.
seed
)
if
not
args
.
do_train
and
not
args
.
do_eval
:
raise
ValueError
(
"At least one of `do_train` or `do_eval` must be True."
)
if
(
torch
.
distributed
.
get_rank
()
==
0
):
if
not
os
.
path
.
exists
(
args
.
output_dir
):
os
.
makedirs
(
args
.
output_dir
)
torch
.
distributed
.
barrier
()
task_name
=
args
.
task_name
.
lower
()
if
task_name
not
in
processors
:
raise
ValueError
(
"Task not found: %s"
%
(
task_name
))
processor
=
processors
[
task_name
]()
output_mode
=
output_modes
[
task_name
]
label_list
=
processor
.
get_labels
()
num_labels
=
len
(
label_list
)
tokenizer
=
BertTokenizer
.
from_pretrained
(
args
.
bert_model
,
do_lower_case
=
args
.
do_lower_case
)
train_examples
=
None
num_train_optimization_steps
=
None
if
args
.
do_train
:
train_examples
=
processor
.
get_train_examples
(
args
.
data_dir
)
num_train_optimization_steps
=
int
(
len
(
train_examples
)
/
args
.
train_batch_size
/
args
.
gradient_accumulation_steps
)
*
args
.
num_train_epochs
if
args
.
local_rank
!=
-
1
:
num_train_optimization_steps
=
num_train_optimization_steps
//
torch
.
distributed
.
get_world_size
()
# Prepare model
cache_dir
=
args
.
cache_dir
if
args
.
cache_dir
else
os
.
path
.
join
(
str
(
PYTORCH_PRETRAINED_BERT_CACHE
),
'distributed_{}'
.
format
(
args
.
local_rank
))
bert_base_model_config
=
{
"vocab_size_or_config_json_file"
:
119547
,
"hidden_size"
:
768
,
"num_hidden_layers"
:
12
,
"num_attention_heads"
:
12
,
"intermediate_size"
:
3072
,
"hidden_act"
:
"gelu"
,
"hidden_dropout_prob"
:
0.1
,
"attention_probs_dropout_prob"
:
0.1
,
"max_position_embeddings"
:
512
,
"type_vocab_size"
:
2
,
"initializer_range"
:
0.02
}
if
args
.
progressive_layer_drop
:
print
(
"BertBaseConfigPreLnLayerDrop"
)
from
nvidia.modelingpreln_layerdrop
import
BertForSequenceClassification
,
BertConfig
elif
args
.
preln
:
from
nvidia.modelingpreln
import
BertForSequenceClassification
,
BertConfig
,
BertLayer
else
:
from
nvidia.modeling
import
BertForSequenceClassification
,
BertConfig
,
BertLayer
bert_config
=
BertConfig
(
**
bert_base_model_config
)
bert_config
.
vocab_size
=
len
(
tokenizer
.
vocab
)
# Padding for divisibility by 8
if
bert_config
.
vocab_size
%
8
!=
0
:
bert_config
.
vocab_size
+=
8
-
(
bert_config
.
vocab_size
%
8
)
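    # e.g. the bert-base-uncased vocab of 30522 entries becomes 30528 here (30522 % 8 == 2, so 6 padding rows are added).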
model
=
BertForSequenceClassification
(
args
,
bert_config
,
num_labels
=
num_labels
)
if
args
.
model_file
!=
"0"
:
logger
.
info
(
f
"Loading Pretrained Bert Encoder from:
{
args
.
model_file
}
"
)
checkpoint_state_dict
=
torch
.
load
(
args
.
model_file
,
map_location
=
torch
.
device
(
"cpu"
))
if
'module'
in
checkpoint_state_dict
:
logger
.
info
(
'Loading DeepSpeed v2.0 style checkpoint'
)
model
.
load_state_dict
(
checkpoint_state_dict
[
'module'
],
strict
=
False
)
elif
'model_state_dict'
in
checkpoint_state_dict
:
model
.
load_state_dict
(
checkpoint_state_dict
[
'model_state_dict'
],
strict
=
False
)
else
:
raise
ValueError
(
"Unable to find model state in checkpoint"
)
logger
.
info
(
f
"Pretrained Bert Encoder Loaded from:
{
args
.
model_file
}
"
)
if
args
.
random
:
logger
.
info
(
"USING RANDOM INITIALISATION FOR FINETUNING"
)
model
.
apply
(
model
.
init_bert_weights
)
if
args
.
fp16
:
model
.
half
()
model
.
to
(
device
)
if
args
.
local_rank
!=
-
1
:
try
:
if
args
.
deepscale
:
print
(
"Enabling DeepScale"
)
from
deepscale.distributed_apex
import
DistributedDataParallel
as
DDP
else
:
from
apex.parallel
import
DistributedDataParallel
as
DDP
except
ImportError
:
raise
ImportError
(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
)
model
=
DDP
(
model
)
elif
n_gpu
>
1
:
model
=
torch
.
nn
.
DataParallel
(
model
)
# Prepare optimizer
param_optimizer
=
list
(
model
.
named_parameters
())
no_decay
=
[
'bias'
,
'LayerNorm.bias'
,
'LayerNorm.weight'
]
if
args
.
deepspeed_transformer_kernel
:
no_decay
=
no_decay
+
[
'attn_nw'
,
'attn_nb'
,
'norm_w'
,
'norm_b'
,
'attn_qkvb'
,
'attn_ob'
,
'inter_b'
,
'output_b'
]
optimizer_grouped_parameters
=
[
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
not
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
0.01
},
{
'params'
:
[
p
for
n
,
p
in
param_optimizer
if
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
0.0
}
]
model
,
optimizer
,
_
,
_
=
deepspeed
.
initialize
(
args
=
args
,
model
=
model
,
model_parameters
=
optimizer_grouped_parameters
,
dist_init_required
=
True
)
global_step
=
0
nb_tr_steps
=
0
tr_loss
=
0
if
args
.
do_train
:
train_features
=
convert_examples_to_features
(
train_examples
,
label_list
,
args
.
max_seq_length
,
tokenizer
,
output_mode
)
logger
.
info
(
"***** Running training *****"
)
logger
.
info
(
" Num examples = %d"
,
len
(
train_examples
))
logger
.
info
(
" Batch size = %d"
,
args
.
train_batch_size
)
logger
.
info
(
" Num steps = %d"
,
num_train_optimization_steps
)
all_input_ids
=
torch
.
tensor
(
[
f
.
input_ids
for
f
in
train_features
],
dtype
=
torch
.
long
)
all_input_mask
=
torch
.
tensor
(
[
f
.
input_mask
for
f
in
train_features
],
dtype
=
torch
.
long
)
all_segment_ids
=
torch
.
tensor
(
[
f
.
segment_ids
for
f
in
train_features
],
dtype
=
torch
.
long
)
if
output_mode
==
"classification"
:
all_label_ids
=
torch
.
tensor
(
[
f
.
label_id
for
f
in
train_features
],
dtype
=
torch
.
long
)
elif
output_mode
==
"regression"
:
if
args
.
fp16
:
all_label_ids
=
torch
.
tensor
(
[
f
.
label_id
for
f
in
train_features
],
dtype
=
torch
.
half
)
else
:
all_label_ids
=
torch
.
tensor
(
[
f
.
label_id
for
f
in
train_features
],
dtype
=
torch
.
float
)
train_data
=
TensorDataset
(
all_input_ids
,
all_input_mask
,
all_segment_ids
,
all_label_ids
)
if
args
.
local_rank
==
-
1
:
train_sampler
=
RandomSampler
(
train_data
)
else
:
train_sampler
=
DistributedSampler
(
train_data
)
train_dataloader
=
DataLoader
(
train_data
,
sampler
=
train_sampler
,
batch_size
=
args
.
train_batch_size
)
model
.
train
()
for
_
in
trange
(
int
(
args
.
num_train_epochs
),
desc
=
"Epoch"
):
tr_loss
=
0
nb_tr_examples
,
nb_tr_steps
=
0
,
0
for
step
,
batch
in
enumerate
(
tqdm
(
train_dataloader
,
desc
=
"Iteration"
)):
batch
=
tuple
(
t
.
to
(
device
)
for
t
in
batch
)
input_ids
,
input_mask
,
segment_ids
,
label_ids
=
batch
# define a new function to compute loss values for both output_modes
logits
=
model
(
input_ids
,
segment_ids
,
input_mask
,
labels
=
None
)
if
output_mode
==
"classification"
:
if
args
.
focal
:
loss_fct
=
FocalLoss
(
class_num
=
num_labels
,
gamma
=
args
.
gamma
)
else
:
loss_fct
=
CrossEntropyLoss
()
loss
=
loss_fct
(
logits
.
view
(
-
1
,
num_labels
),
label_ids
.
view
(
-
1
))
elif
output_mode
==
"regression"
:
loss_fct
=
MSELoss
()
loss
=
loss_fct
(
logits
.
view
(
-
1
),
label_ids
.
view
(
-
1
))
if
n_gpu
>
1
:
loss
=
loss
.
mean
()
# mean() to average on multi-gpu.
if
args
.
gradient_accumulation_steps
>
1
:
loss
=
loss
/
args
.
gradient_accumulation_steps
if
args
.
deepscale
and
args
.
local_rank
!=
-
1
:
model
.
disable_need_reduction
()
if
(
step
+
1
)
%
args
.
gradient_accumulation_steps
==
0
:
model
.
enable_need_reduction
()
if
args
.
fp16
:
optimizer
.
backward
(
loss
)
else
:
loss
.
backward
()
tr_loss
+=
loss
.
item
()
nb_tr_examples
+=
input_ids
.
size
(
0
)
nb_tr_steps
+=
1
if
(
step
+
1
)
%
args
.
gradient_accumulation_steps
==
0
:
if
args
.
fp16
:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step
=
args
.
learning_rate
*
\
warmup_linear
(
global_step
/
num_train_optimization_steps
,
args
.
warmup_proportion
)
for
param_group
in
optimizer
.
param_groups
:
param_group
[
'lr'
]
=
lr_this_step
optimizer
.
step
()
optimizer
.
zero_grad
()
global_step
+=
1
if
args
.
do_eval
and
(
args
.
local_rank
==
-
1
or
torch
.
distributed
.
get_rank
()
==
0
):
eval_examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
eval_features
=
convert_examples_to_features
(
eval_examples
,
label_list
,
args
.
max_seq_length
,
tokenizer
,
output_mode
)
logger
.
info
(
"***** Running evaluation *****"
)
logger
.
info
(
" Num examples = %d"
,
len
(
eval_examples
))
logger
.
info
(
" Batch size = %d"
,
args
.
eval_batch_size
)
all_input_ids
=
torch
.
tensor
(
[
f
.
input_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_input_mask
=
torch
.
tensor
(
[
f
.
input_mask
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_segment_ids
=
torch
.
tensor
(
[
f
.
segment_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
if
output_mode
==
"classification"
:
all_label_ids
=
torch
.
tensor
(
[
f
.
label_id
for
f
in
eval_features
],
dtype
=
torch
.
long
)
elif
output_mode
==
"regression"
:
all_label_ids
=
torch
.
tensor
(
[
f
.
label_id
for
f
in
eval_features
],
dtype
=
torch
.
float
)
eval_data
=
TensorDataset
(
all_input_ids
,
all_input_mask
,
all_segment_ids
,
all_label_ids
)
# Run prediction for full data
eval_sampler
=
SequentialSampler
(
eval_data
)
eval_dataloader
=
DataLoader
(
eval_data
,
sampler
=
eval_sampler
,
batch_size
=
args
.
eval_batch_size
)
model
.
eval
()
eval_loss
=
0
nb_eval_steps
=
0
preds
=
[]
for
input_ids
,
input_mask
,
segment_ids
,
label_ids
in
tqdm
(
eval_dataloader
,
desc
=
"Evaluating"
):
input_ids
=
input_ids
.
to
(
device
)
input_mask
=
input_mask
.
to
(
device
)
segment_ids
=
segment_ids
.
to
(
device
)
label_ids
=
label_ids
.
to
(
device
)
with
torch
.
no_grad
():
logits
=
model
(
input_ids
,
segment_ids
,
input_mask
,
labels
=
None
)
# create eval loss and other metric required by the task
if
output_mode
==
"classification"
:
if
args
.
focal
:
loss_fct
=
FocalLoss
(
class_num
=
num_labels
,
gamma
=
args
.
gamma
)
else
:
loss_fct
=
CrossEntropyLoss
()
tmp_eval_loss
=
loss_fct
(
logits
.
view
(
-
1
,
num_labels
),
label_ids
.
view
(
-
1
))
elif
output_mode
==
"regression"
:
loss_fct
=
MSELoss
()
print
(
logits
.
type
())
print
(
label_ids
.
type
())
if
task_name
==
"sts-b"
:
tmp_eval_loss
=
loss_fct
(
logits
.
float
().
view
(
-
1
),
label_ids
.
view
(
-
1
))
else
:
tmp_eval_loss
=
loss_fct
(
logits
.
view
(
-
1
),
label_ids
.
view
(
-
1
))
eval_loss
+=
tmp_eval_loss
.
mean
().
item
()
nb_eval_steps
+=
1
if
len
(
preds
)
==
0
:
preds
.
append
(
logits
.
detach
().
cpu
().
numpy
())
else
:
preds
[
0
]
=
np
.
append
(
preds
[
0
],
logits
.
detach
().
cpu
().
numpy
(),
axis
=
0
)
eval_loss
=
eval_loss
/
nb_eval_steps
preds
=
preds
[
0
]
if
output_mode
==
"classification"
:
preds
=
np
.
argmax
(
preds
,
axis
=
1
)
elif
output_mode
==
"regression"
:
preds
=
np
.
squeeze
(
preds
)
result
=
compute_metrics
(
task_name
,
preds
,
all_label_ids
.
numpy
())
loss
=
tr_loss
/
nb_tr_steps
if
args
.
do_train
else
None
result
[
'eval_loss'
]
=
eval_loss
result
[
'global_step'
]
=
global_step
result
[
'loss'
]
=
loss
output_eval_file
=
os
.
path
.
join
(
args
.
output_dir
,
"eval_results.txt"
)
with
open
(
output_eval_file
,
"w"
)
as
writer
:
logger
.
info
(
"***** Eval results *****"
)
for
key
in
sorted
(
result
.
keys
()):
logger
.
info
(
" %s = %s"
,
key
,
str
(
result
[
key
]))
writer
.
write
(
"%s = %s
\n
"
%
(
key
,
str
(
result
[
key
])))
# hack for MNLI-MM
if
task_name
==
"mnli"
:
task_name
=
"mnli-mm"
processor
=
processors
[
task_name
]()
if
os
.
path
.
exists
(
args
.
output_dir
+
'-MM'
)
and
os
.
listdir
(
args
.
output_dir
+
'-MM'
)
and
args
.
do_train
:
raise
ValueError
(
"Output directory ({}{}) already exists and is not empty."
.
format
(
args
.
output_dir
,
'-MM'
))
if
not
os
.
path
.
exists
(
args
.
output_dir
+
'-MM'
):
os
.
makedirs
(
args
.
output_dir
+
'-MM'
)
eval_examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
eval_features
=
convert_examples_to_features
(
eval_examples
,
label_list
,
args
.
max_seq_length
,
tokenizer
,
output_mode
)
logger
.
info
(
"***** Running evaluation *****"
)
logger
.
info
(
" Num examples = %d"
,
len
(
eval_examples
))
logger
.
info
(
" Batch size = %d"
,
args
.
eval_batch_size
)
all_input_ids
=
torch
.
tensor
(
[
f
.
input_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_input_mask
=
torch
.
tensor
(
[
f
.
input_mask
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_segment_ids
=
torch
.
tensor
(
[
f
.
segment_ids
for
f
in
eval_features
],
dtype
=
torch
.
long
)
all_label_ids
=
torch
.
tensor
(
[
f
.
label_id
for
f
in
eval_features
],
dtype
=
torch
.
long
)
eval_data
=
TensorDataset
(
all_input_ids
,
all_input_mask
,
all_segment_ids
,
all_label_ids
)
# Run prediction for full data
eval_sampler
=
SequentialSampler
(
eval_data
)
eval_dataloader
=
DataLoader
(
eval_data
,
sampler
=
eval_sampler
,
batch_size
=
args
.
eval_batch_size
)
model
.
eval
()
eval_loss
=
0
nb_eval_steps
=
0
preds
=
[]
for
input_ids
,
input_mask
,
segment_ids
,
label_ids
in
tqdm
(
eval_dataloader
,
desc
=
"Evaluating"
):
input_ids
=
input_ids
.
to
(
device
)
input_mask
=
input_mask
.
to
(
device
)
segment_ids
=
segment_ids
.
to
(
device
)
label_ids
=
label_ids
.
to
(
device
)
with
torch
.
no_grad
():
logits
=
model
(
input_ids
,
segment_ids
,
input_mask
,
labels
=
None
)
if
args
.
focal
:
loss_fct
=
FocalLoss
(
class_num
=
num_labels
,
gamma
=
args
.
gamma
)
else
:
loss_fct
=
CrossEntropyLoss
()
tmp_eval_loss
=
loss_fct
(
logits
.
view
(
-
1
,
num_labels
),
label_ids
.
view
(
-
1
))
eval_loss
+=
tmp_eval_loss
.
mean
().
item
()
nb_eval_steps
+=
1
if
len
(
preds
)
==
0
:
preds
.
append
(
logits
.
detach
().
cpu
().
numpy
())
else
:
preds
[
0
]
=
np
.
append
(
preds
[
0
],
logits
.
detach
().
cpu
().
numpy
(),
axis
=
0
)
eval_loss
=
eval_loss
/
nb_eval_steps
preds
=
preds
[
0
]
preds
=
np
.
argmax
(
preds
,
axis
=
1
)
result
=
compute_metrics
(
task_name
,
preds
,
all_label_ids
.
numpy
())
loss
=
tr_loss
/
nb_tr_steps
if
args
.
do_train
else
None
result
[
'eval_loss'
]
=
eval_loss
result
[
'global_step'
]
=
global_step
result
[
'loss'
]
=
loss
output_eval_file
=
os
.
path
.
join
(
args
.
output_dir
+
'-MM'
,
"eval_results.txt"
)
with
open
(
output_eval_file
,
"w"
)
as
writer
:
logger
.
info
(
"***** Eval results *****"
)
for
key
in
sorted
(
result
.
keys
()):
logger
.
info
(
" %s = %s"
,
key
,
str
(
result
[
key
]))
writer
.
write
(
"%s = %s
\n
"
%
(
key
,
str
(
result
[
key
])))
if
__name__
==
"__main__"
:
main
()
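As a quick sanity check of the metric helpers defined in run_glue_classifier_bert_base.py above, a small usage sketch. It assumes the file is importable as a module named run_glue_classifier_bert_base and that the script's own dependencies (torch, deepspeed, turing, etc.) are installed; the toy arrays are illustrative only.

import numpy as np
from run_glue_classifier_bert_base import simple_accuracy, acc_and_f1, compute_metrics

preds = np.array([1, 0, 1, 1])
labels = np.array([1, 0, 0, 1])

print(simple_accuracy(preds, labels))         # 0.75
print(acc_and_f1(preds, labels))              # accuracy, binary F1, and their average
print(compute_metrics("rte", preds, labels))  # {'acc': 0.75}

# STS-B is the one regression task: its metrics are Pearson/Spearman correlations.
print(compute_metrics("sts-b",
                      np.array([0.1, 0.4, 0.6, 0.9]),
                      np.array([0.2, 0.5, 0.55, 1.0])))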
Deepspeed/BingBertGlue/run_glue_classifier_bert_large.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function

import sys
sys.path = ['/home/t-yuchenglu/DeepSpeed'] + sys.path

import argparse
import csv
import logging
import os
import random

import deepspeed
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss, MSELoss
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
from turing.loss import FocalLoss

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
def
checkpoint_model
(
PATH
,
ckpt_id
,
model
,
epoch
,
last_global_step
,
last_global_data_samples
,
**
kwargs
):
"""Utility function for checkpointing model + optimizer dictionaries
The main purpose for this is to be able to resume training from that instant again
"""
checkpoint_state_dict
=
{
'epoch'
:
epoch
,
'last_global_step'
:
last_global_step
,
'last_global_data_samples'
:
last_global_data_samples
}
# Add extra kwargs too
checkpoint_state_dict
.
update
(
kwargs
)
#success = model.network.save_checkpoint(PATH, ckpt_id,
success
=
model
.
save_checkpoint
(
PATH
,
ckpt_id
,
checkpoint_state_dict
)
status_msg
=
'checkpointing: PATH={}, ckpt_id={}'
.
format
(
PATH
,
ckpt_id
)
if
success
:
logging
.
info
(
f
"Success
{
status_msg
}
"
)
else
:
logging
.
warning
(
f
"Failure
{
status_msg
}
"
)
return
def
load_checkpoint
(
model
,
PATH
,
ckpt_id
):
"""Utility function for checkpointing model + optimizer dictionaries
The main purpose for this is to be able to resume training from that instant again
"""
model
.
load_checkpoint
(
PATH
,
ckpt_id
)
return
class
InputExample
(
object
):
"""A single training/test example for simple sequence classification."""
def
__init__
(
self
,
guid
,
text_a
,
text_b
=
None
,
label
=
None
):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self
.
guid
=
guid
self
.
text_a
=
text_a
self
.
text_b
=
text_b
self
.
label
=
label
class
InputFeatures
(
object
):
"""A single set of features of data."""
def
__init__
(
self
,
input_ids
,
input_mask
,
segment_ids
,
label_id
):
self
.
input_ids
=
input_ids
self
.
input_mask
=
input_mask
self
.
segment_ids
=
segment_ids
self
.
label_id
=
label_id
class
DataProcessor
(
object
):
"""Base class for data converters for sequence classification data sets."""
def
get_train_examples
(
self
,
data_dir
):
"""Gets a collection of `InputExample`s for the train set."""
raise
NotImplementedError
()
def
get_dev_examples
(
self
,
data_dir
):
"""Gets a collection of `InputExample`s for the dev set."""
raise
NotImplementedError
()
def
get_labels
(
self
):
"""Gets the list of labels for this data set."""
raise
NotImplementedError
()
@
classmethod
def
_read_tsv
(
cls
,
input_file
,
quotechar
=
None
):
"""Reads a tab separated value file."""
with
open
(
input_file
,
"r"
,
encoding
=
'utf-8'
)
as
f
:
reader
=
csv
.
reader
(
f
,
delimiter
=
"
\t
"
,
quotechar
=
quotechar
)
lines
=
[]
for
line
in
reader
:
if
sys
.
version_info
[
0
]
==
2
:
line
=
list
(
unicode
(
cell
,
'utf-8'
)
for
cell
in
line
)
lines
.
append
(
line
)
return
lines
class
MrpcProcessor
(
DataProcessor
):
"""Processor for the MRPC data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
logger
.
info
(
"LOOKING AT {}"
.
format
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)))
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
line
[
3
]
text_b
=
line
[
4
]
label
=
line
[
0
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
MnliProcessor
(
DataProcessor
):
"""Processor for the MultiNLI data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev_matched.tsv"
)),
"dev_matched"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"contradiction"
,
"entailment"
,
"neutral"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
8
]
text_b
=
line
[
9
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
MnliMismatchedProcessor
(
MnliProcessor
):
"""Processor for the MultiNLI Mismatched data set (GLUE version)."""
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev_mismatched.tsv"
)),
"dev_matched"
)
class
ColaProcessor
(
DataProcessor
):
"""Processor for the CoLA data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
line
[
3
]
label
=
line
[
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
None
,
label
=
label
))
return
examples
class
Sst2Processor
(
DataProcessor
):
"""Processor for the SST-2 data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
line
[
0
]
label
=
line
[
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
None
,
label
=
label
))
return
examples
class
StsbProcessor
(
DataProcessor
):
"""Processor for the STS-B data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
None
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
7
]
text_b
=
line
[
8
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
QqpProcessor
(
DataProcessor
):
"""Processor for the STS-B data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
try
:
text_a
=
line
[
3
]
text_b
=
line
[
4
]
label
=
line
[
5
]
except
IndexError
:
continue
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
QnliProcessor
(
DataProcessor
):
"""Processor for the STS-B data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev_matched"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"entailment"
,
"not_entailment"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
1
]
text_b
=
line
[
2
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
RteProcessor
(
DataProcessor
):
"""Processor for the RTE data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"entailment"
,
"not_entailment"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
1
]
text_b
=
line
[
2
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
class
WnliProcessor
(
DataProcessor
):
"""Processor for the WNLI data set (GLUE version)."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
if
i
==
0
:
continue
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
1
]
text_b
=
line
[
2
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
))
return
examples
def
convert_examples_to_features
(
examples
,
label_list
,
max_seq_length
,
tokenizer
,
output_mode
):
"""Loads a data file into a list of `InputBatch`s."""
label_map
=
{
label
:
i
for
i
,
label
in
enumerate
(
label_list
)}
features
=
[]
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
if
ex_index
%
10000
==
0
:
logger
.
info
(
"Writing example %d of %d"
%
(
ex_index
,
len
(
examples
)))
tokens_a
=
tokenizer
.
tokenize
(
example
.
text_a
)
tokens_b
=
None
if
example
.
text_b
:
tokens_b
=
tokenizer
.
tokenize
(
example
.
text_b
)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair
(
tokens_a
,
tokens_b
,
max_seq_length
-
3
)
else
:
# Account for [CLS] and [SEP] with "- 2"
if
len
(
tokens_a
)
>
max_seq_length
-
2
:
tokens_a
=
tokens_a
[:(
max_seq_length
-
2
)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens
=
[
"[CLS]"
]
+
tokens_a
+
[
"[SEP]"
]
segment_ids
=
[
0
]
*
len
(
tokens
)
if
tokens_b
:
tokens
+=
tokens_b
+
[
"[SEP]"
]
segment_ids
+=
[
1
]
*
(
len
(
tokens_b
)
+
1
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask
=
[
1
]
*
len
(
input_ids
)
# Zero-pad up to the sequence length.
padding
=
[
0
]
*
(
max_seq_length
-
len
(
input_ids
))
input_ids
+=
padding
input_mask
+=
padding
segment_ids
+=
padding
assert
len
(
input_ids
)
==
max_seq_length
assert
len
(
input_mask
)
==
max_seq_length
assert
len
(
segment_ids
)
==
max_seq_length
if
output_mode
==
"classification"
:
label_id
=
label_map
[
example
.
label
]
elif
output_mode
==
"regression"
:
label_id
=
float
(
example
.
label
)
else
:
raise
KeyError
(
output_mode
)
if
ex_index
<
5
:
logger
.
info
(
"*** Example ***"
)
logger
.
info
(
"guid: %s"
%
(
example
.
guid
))
logger
.
info
(
"tokens: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
tokens
]))
logger
.
info
(
"input_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_ids
]))
logger
.
info
(
"input_mask: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_mask
]))
logger
.
info
(
"segment_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
segment_ids
]))
logger
.
info
(
"label: %s (id = %d)"
%
(
example
.
label
,
label_id
))
features
.
append
(
InputFeatures
(
input_ids
=
input_ids
,
input_mask
=
input_mask
,
segment_ids
=
segment_ids
,
label_id
=
label_id
))
return
features
def
_truncate_seq_pair
(
tokens_a
,
tokens_b
,
max_length
):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while
True
:
total_length
=
len
(
tokens_a
)
+
len
(
tokens_b
)
if
total_length
<=
max_length
:
break
if
len
(
tokens_a
)
>
len
(
tokens_b
):
tokens_a
.
pop
()
else
:
tokens_b
.
pop
()
def
simple_accuracy
(
preds
,
labels
):
return
(
preds
==
labels
).
mean
()
def
acc_and_f1
(
preds
,
labels
):
acc
=
simple_accuracy
(
preds
,
labels
)
f1
=
f1_score
(
y_true
=
labels
,
y_pred
=
preds
)
return
{
"acc"
:
acc
,
"f1"
:
f1
,
"acc_and_f1"
:
(
acc
+
f1
)
/
2
,
}
def
pearson_and_spearman
(
preds
,
labels
):
pearson_corr
=
pearsonr
(
preds
,
labels
)[
0
]
spearman_corr
=
spearmanr
(
preds
,
labels
)[
0
]
return
{
"pearson"
:
pearson_corr
,
"spearmanr"
:
spearman_corr
,
"corr"
:
(
pearson_corr
+
spearman_corr
)
/
2
,
}
def
compute_metrics
(
task_name
,
preds
,
labels
):
assert
len
(
preds
)
==
len
(
labels
)
if
task_name
==
"cola"
:
return
{
"mcc"
:
matthews_corrcoef
(
labels
,
preds
)}
elif
task_name
==
"sst-2"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"mrpc"
:
return
acc_and_f1
(
preds
,
labels
)
elif
task_name
==
"sts-b"
:
return
pearson_and_spearman
(
preds
,
labels
)
elif
task_name
==
"qqp"
:
return
acc_and_f1
(
preds
,
labels
)
elif
task_name
==
"mnli"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"mnli-mm"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"qnli"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"rte"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
elif
task_name
==
"wnli"
:
return
{
"acc"
:
simple_accuracy
(
preds
,
labels
)}
else
:
raise
KeyError
(
task_name
)
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    # Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization.\n"
        "Sequences longer than this will be truncated, and sequences shorter\n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument("--model_file",
                        type=str,
                        default="0",
                        help="Path to the Pretrained BERT Encoder File.")
    parser.add_argument('--random',
                        default=False,
                        action='store_true',
                        help="Whether to finetune from a random initialization")
    parser.add_argument('--focal',
                        default=False,
                        action='store_true',
                        help="Whether to use Focal Loss for finetuning.")
    parser.add_argument('--gamma',
                        type=float,
                        default=0.5,
                        help="Gamma parameter to be used in focal loss.")
    parser.add_argument('--deepspeed_sparse_attention',
                        default=False,
                        action='store_true',
                        help='Use DeepSpeed sparse self attention.')
    parser.add_argument(
        '--preln',
        action='store_true',
        default=False,
        help="Switching to the variant of Transformer blocks that use pre-LayerNorm.")
    parser.add_argument('--deepspeed_transformer_kernel',
                        default=False,
                        action='store_true',
                        help='Use DeepSpeed transformer kernel to accelerate.')
    parser.add_argument(
        '--progressive_layer_drop',
        default=False,
        action='store_true',
        help="Whether to enable progressive layer dropping or not")

    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    args.seed = random.randint(1, 1000)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if (torch.distributed.get_rank() == 0):
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
    torch.distributed.barrier()
    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE),
        'distributed_{}'.format(args.local_rank))

    bert_base_model_config = {
        "vocab_size_or_config_json_file": 119547,
        "hidden_size": 1024,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "intermediate_size": 4096,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 512,
        "type_vocab_size": 2,
        "initializer_range": 0.02
    }

    if args.progressive_layer_drop:
        print("BertBaseConfigPreLnLayerDrop")
        from nvidia.modelingpreln_layerdrop import BertForSequenceClassification, BertConfig, BertLayer
    elif args.preln:
        from nvidia.modelingpreln import BertForSequenceClassification, BertConfig, BertLayer
    else:
        from nvidia.modeling import BertForSequenceClassification, BertConfig, BertLayer

    bert_config = BertConfig(**bert_base_model_config)
    bert_config.vocab_size = len(tokenizer.vocab)
    # Padding for divisibility by 8
    if bert_config.vocab_size % 8 != 0:
        bert_config.vocab_size += 8 - (bert_config.vocab_size % 8)
    model = BertForSequenceClassification(args,
                                          bert_config,
                                          num_labels=num_labels)

    if args.model_file != "0":
        logger.info(f"Loading Pretrained Bert Encoder from: {args.model_file}")
        checkpoint_state_dict = torch.load(args.model_file,
                                           map_location=torch.device("cpu"))
        if 'module' in checkpoint_state_dict:
            logger.info('Loading DeepSpeed v2.0 style checkpoint')
            model.load_state_dict(checkpoint_state_dict['module'],
                                  strict=False)
        elif 'model_state_dict' in checkpoint_state_dict:
            model.load_state_dict(checkpoint_state_dict['model_state_dict'],
                                  strict=False)
        else:
            raise ValueError("Unable to find model state in checkpoint")
        logger.info(f"Pretrained Bert Encoder Loaded from: {args.model_file}")

    if args.random:
        logger.info("USING RANDOM INITIALISATION FOR FINETUNING")
        model.apply(model.init_bert_weights)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            if args.deepscale:
                print("Enabling DeepScale")
                from deepscale.distributed_apex import DistributedDataParallel as DDP
            else:
                from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    model, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model,
        model_parameters=optimizer_grouped_parameters,
        dist_init_required=True)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            if args.fp16:
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.half)
            else:
                all_label_ids = torch.tensor(
                    [f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        nb_tr_examples = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    if args.focal:
                        loss_fct = FocalLoss(class_num=num_labels,
                                             gamma=args.gamma)
                    else:
                        loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.deepscale and args.local_rank != -1:
                    model.disable_need_reduction()
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        model.enable_need_reduction()

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * \
                            warmup_linear(global_step / num_train_optimization_steps,
                                          args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        saved_path = os.path.join(args.output_dir,
                                  "finetuned_quantized_checkpoints")

        checkpoint_model(PATH=saved_path,
                         ckpt_id='epoch{}_step{}'.format(
                             args.num_train_epochs, global_step),
                         model=model,
                         epoch=args.num_train_epochs,
                         last_global_step=global_step,
                         last_global_data_samples=nb_tr_examples *
                         torch.distributed.get_world_size())
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                if args.focal:
                    loss_fct = FocalLoss(class_num=num_labels,
                                         gamma=args.gamma)
                else:
                    loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                print(logits.type())
                print(label_ids.type())
                if task_name == "sts-b":
                    tmp_eval_loss = loss_fct(logits.float().view(-1),
                                             label_ids.view(-1))
                else:
                    tmp_eval_loss = loss_fct(logits.view(-1),
                                             label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss / nb_tr_steps if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir +
                              '-MM') and os.listdir(args.output_dir +
                                                    '-MM') and args.do_train:
                raise ValueError(
                    "Output directory ({}{}) already exists and is not empty.".
                    format(args.output_dir, '-MM'))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids,
                                   segment_ids,
                                   input_mask,
                                   labels=None)

                if args.focal:
                    loss_fct = FocalLoss(class_num=num_labels,
                                         gamma=args.gamma)
                else:
                    loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0],
                                         logits.detach().cpu().numpy(),
                                         axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss / nb_tr_steps if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM',
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
    main()
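For reference, the fp16 branch of the training loop above rescales the learning rate with warmup_linear, which this script imports from pytorch_pretrained_bert.optimization elsewhere in the file. A minimal sketch of that schedule, assuming the standard linear warmup followed by linear decay used by the BERT reference code:

def warmup_linear_sketch(x, warmup=0.002):
    # x is the fraction of training completed (global_step / total_steps).
    # Ramp the multiplier up linearly during the warmup fraction...
    if x < warmup:
        return x / warmup
    # ...then decay it linearly toward zero over the rest of training.
    return 1.0 - x

# Hypothetical usage mirroring the loop above (names reused for illustration):
# lr_this_step = args.learning_rate * warmup_linear_sketch(
#     global_step / num_train_optimization_steps, args.warmup_proportion)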
Deepspeed/BingBertGlue/turing/dataset.py
0 → 100644
View file @
316d3f90
# __AUTHOR__ : SAKSHAM SINGHAL
# __EMAIL__ : SAKSINGH@MICROSOFT.COM

import torch
import os
from torch.utils.data import DataLoader, Dataset
from enum import IntEnum
from random import choice
import random
import collections
import time

from turing.utils import namedtorchbatch
from turing.text import mask, torch_long, PAD
from turing.sources import QueryPassageDataset, QueryInstanceDataset, \
    PretrainingDataCreator, TokenInstance, QueryPassageFineTuningDataset, \
    WikiNBookCorpusPretrainingDataCreator, CleanBodyDataCreator, \
    NumpyPretrainingDataCreator
from turing.sources import WikiPretrainingDataCreator
from pytorch_pretrained_bert.tokenization import BertTokenizer
class BatchType(IntEnum):
    RANKING_BATCH = 0
    QP_BATCH = 1
    PRETRAIN_BATCH = 2


class PretrainDataType(IntEnum):
    NUMPY = 0
    VALIDATION = 1


MaskedLMInstance = collections.namedtuple("MaskedLMInstance",
                                          ["index", "label"])

QABatch = collections.namedtuple(
    'QABatch', ['input_ids', 'input_mask', 'sequence_ids', 'label'])

RankingBatch = collections.namedtuple(
    'RankingBatch', ['input_ids', 'input_mask', 'sequence_ids', 'label'])

PretrainBatch = collections.namedtuple('PreTrainBatch', [
    'input_ids', 'input_mask', 'sequence_ids', 'is_next_label',
    'masked_lm_output'
])
class BertJobType(IntEnum):
    """Enumerates the various tasks that we will be running
    """
    QA_TASK = 0  # This is Q-P pair prediction
    MLM = 1  # Masking LM for captions data
    NSP = 1  # Next Sentence Prediction task


def get_random_partition(data_directory, index):
    partitions = [
        os.path.join(data_directory, x) for x in os.listdir(data_directory)
    ]
    partitions = sorted(partitions)
    i = index % len(partitions)
    return partitions[i]
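Despite its name, get_random_partition is deterministic: it sorts the shard files and picks one by index modulo the shard count. A tiny sketch of that selection rule with hypothetical shard names (the real call lists data_directory on disk):

# Illustrative only: made-up shard file names.
partitions = sorted(["part_02.bin", "part_00.bin", "part_01.bin"])
for index in range(5):
    print(index, partitions[index % len(partitions)])
# 0 part_00.bin, 1 part_01.bin, 2 part_02.bin, 3 part_00.bin, 4 part_01.bin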
def map_to_torch(encoding):
    encoding = torch_long(encoding)
    encoding.requires_grad_(False)
    return encoding


def map_to_torch_float(encoding):
    encoding = torch.FloatTensor(encoding)
    encoding.requires_grad_(False)
    return encoding


def map_to_torch_half(encoding):
    encoding = torch.HalfTensor(encoding)
    encoding.requires_grad_(False)
    return encoding


def encode_sequence(seqA, seqB, max_seq_len, tokenizer):
    seqA = ["[CLS]"] + seqA + ["[SEP]"]
    seqB = seqB + ["[SEP]"]

    input_tokens = seqA + seqB
    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
    sequence_ids = [0] * len(seqA) + [1] * len(seqB)
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_len:
        input_ids.append(PAD)
        sequence_ids.append(PAD)
        input_mask.append(PAD)

    return (map_to_torch(input_ids), map_to_torch(input_mask),
            map_to_torch(sequence_ids))
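A small sketch of what encode_sequence produces, assuming a WordPiece tokenizer loaded through the BertTokenizer shipped in this repository and a toy query/passage pair (the strings and vocabulary choice are illustrative, not repository data):

# Illustrative only; assumes the named vocabulary can be resolved locally or downloaded.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

seqA = tokenizer.tokenize("what is deepspeed")
seqB = tokenizer.tokenize("deepspeed is a deep learning optimization library")

input_ids, input_mask, sequence_ids = encode_sequence(seqA, seqB, 32, tokenizer)
# All three tensors have length 32 (max_seq_len): real tokens first, then PAD.
# sequence_ids marks [CLS] + seqA + [SEP] with 0 and seqB + [SEP] with 1.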
def truncate_input_sequence(tokens_a, tokens_b, max_num_tokens):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if random.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()
class QADataset(Dataset):
    def __init__(self, tokenizer: BertTokenizer, folder: str, logger,
                 max_seq_len, index):
        self.tokenizer = tokenizer
        self.dir_path = folder
        self.max_seq_len = max_seq_len
        self.len = 0
        path = get_random_partition(self.dir_path, index)
        logger.info(f"Loading Query-Passage Pairs from {path}")
        self.data = QueryPassageDataset(path)
        self.len = len(self.data)
        logger.info(
            f"Data Loading Completed for Query-Passage Pairs from {path} with {self.len} samples."
        )

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        i = index % self.len
        query, passage, label = self.data.all_pairs[i]
        label = float(label)

        # sample_choice = choice([0, 1])
        # if sample_choice == 0:  # generate negative sample
        #     query, passage = self.data.all_pairs[i][0], self.data.all_pairs[i+1][1]
        # else:  # generate positive sample
        #     query, passage = self.data.all_pairs[i]

        query_tokens = self.tokenizer.tokenize(query)
        passage_tokens = self.tokenizer.tokenize(passage)

        if (len(query_tokens) > self.max_seq_len // 2):
            query_tokens = query_tokens[0:self.max_seq_len // 2]

        max_passage_tokens = self.max_seq_len - \
            len(query_tokens) - 3  # Removing 3 for SEP and CLS

        if (len(passage_tokens) > max_passage_tokens):
            passage_tokens = passage_tokens[0:max_passage_tokens]

        input_ids, input_mask, sequence_ids = encode_sequence(
            query_tokens, passage_tokens, self.max_seq_len, self.tokenizer)
        return tuple([
            map_to_torch([BatchType.QP_BATCH]), input_ids, input_mask,
            sequence_ids,
            map_to_torch_float([label])
        ])
        # return QABatch(input_ids=input_ids, input_mask=input_mask, sequence_ids=sequence_ids, label=map_to_torch([label]))
class QAFinetuningDataset(QADataset):
    def __init__(self, tokenizer: BertTokenizer, file_path, logger,
                 max_seq_len):
        self.tokenizer = tokenizer
        self.path = file_path
        self.max_seq_len = max_seq_len
        self.len = 0

        logger.info(f"Loading Query-Passage Pairs from {self.path}")
        self.data = QueryPassageFineTuningDataset(self.path)
        self.len = len(self.data)
        logger.info(
            f"Data Loading Completed for Finetuning Query-Passage Pairs from {self.path} with {self.len} samples."
        )
class RankingDataset(Dataset):
    def __init__(self,
                 tokenizer: BertTokenizer,
                 folder: str,
                 logger,
                 max_seq_len,
                 index,
                 fp16=False):
        self.tokenizer = tokenizer
        self.dir_path = folder
        self.max_seq_len = max_seq_len
        self.len = 0
        self.fp16 = fp16
        path = get_random_partition(self.dir_path, index)
        logger.info(f"Loading Query-Instance Pairs from {path}")
        self.data = QueryInstanceDataset(path)
        self.len = len(self.data)
        logger.info(
            f"Data Loading Completed for Query-Instance Pairs from {path} with {self.len} samples."
        )

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        i = index % self.len
        query, instance, label = self.data.all_pairs[i]
        label = float(label)
        instances = instance.split('<sep>')

        query_tokens = self.tokenizer.tokenize(query)
        instances = [self.tokenizer.tokenize(x) for x in instances]
        instance_tokens = []
        for x in instances:
            instance_tokens.extend(x)
            instance_tokens.append('[SEP]')
        instance_tokens = instance_tokens[:-1]
        # instance_tokens = self.tokenizer.tokenize(instance)

        if (len(query_tokens) > self.max_seq_len // 2):
            query_tokens = query_tokens[0:self.max_seq_len // 2]

        max_instance_tokens = self.max_seq_len - \
            len(query_tokens) - 3  # Removing 3 for SEP and CLS

        if (len(instance_tokens) > max_instance_tokens):
            instance_tokens = instance_tokens[0:max_instance_tokens]

        input_ids, input_mask, sequence_ids = encode_sequence(
            query_tokens, instance_tokens, self.max_seq_len, self.tokenizer)
        return tuple([
            map_to_torch([BatchType.RANKING_BATCH]), input_ids, input_mask,
            sequence_ids,
            map_to_torch_float([label])
        ])
class PreTrainingDataset(Dataset):
    def __init__(self,
                 tokenizer: BertTokenizer,
                 folder: str,
                 logger,
                 max_seq_length,
                 index,
                 data_type: PretrainDataType = PretrainDataType.NUMPY,
                 max_predictions_per_seq: int = 20):
        self.tokenizer = tokenizer
        self.dir_path = folder
        self.max_seq_length = max_seq_length
        self.len = 0
        self.masked_lm_prob = 0.15
        self.max_predictions_per_seq = max_predictions_per_seq
        self.vocab_words = list(tokenizer.vocab.keys())

        path = get_random_partition(self.dir_path, index)

        logger.info(f"Loading Pretraining Data from {path}")
        start = time.time()
        # logger.info(f"Loading Pretraining Data from {path}")
        # if data_type == PretrainDataType.CLEAN_BODY:
        #     self.data = CleanBodyDataCreator.load(path)
        # elif data_type == PretrainDataType.WIKIPEDIA or data_type == PretrainDataType.BOOK_CORPUS:
        #     self.data = WikiNBookCorpusPretrainingDataCreator.load(path)
        if data_type == PretrainDataType.VALIDATION:
            self.data = WikiPretrainingDataCreator.load(path)
        elif data_type == PretrainDataType.NUMPY:
            self.data = NumpyPretrainingDataCreator.load(path)
        self.len = len(self.data)
        logger.info(
            f"Data Loading Completed for Pretraining Data from {path} with {self.len} samples took {time.time() - start:.2f}s."
        )

        self.len = len(self.data)
        logger.info(
            f"Data Loading Completed for Pretraining Data from {path} with {self.len} samples."
        )
    def __len__(self):
        return self.len

    def __getitem__(self, index):
        i = index % self.len
        instance: TokenInstance = self.data.instances[i]
        return self.create_training_instance(instance)
    def create_training_instance(self, instance: TokenInstance):
        tokens_a, tokens_b, is_next = instance.get_values()
        # print(f'is_next label:{is_next}')
        # Create mapper
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)

        tokens.append("[SEP]")
        segment_ids.append(0)

        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)

        tokens.append("[SEP]")
        segment_ids.append(1)

        # Get Masked LM predictions
        tokens, masked_lm_output = self.create_masked_lm_predictions(tokens)

        # Convert to Ids
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        while len(input_ids) < self.max_seq_length:
            input_ids.append(PAD)
            segment_ids.append(PAD)
            input_mask.append(PAD)
            masked_lm_output.append(-1)
        return ([
            map_to_torch([BatchType.PRETRAIN_BATCH]),
            map_to_torch(input_ids),
            map_to_torch(input_mask),
            map_to_torch(segment_ids),
            map_to_torch([is_next]),
            map_to_torch(masked_lm_output)
        ])
    def create_masked_lm_predictions(self, tokens):
        cand_indexes = []
        for i, token in enumerate(tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue
            cand_indexes.append(i)

        random.shuffle(cand_indexes)
        output_tokens = list(tokens)

        num_to_predict = min(
            self.max_predictions_per_seq,
            max(1, int(round(len(tokens) * self.masked_lm_prob))))

        masked_lms = []
        covered_indexes = set()
        for index in cand_indexes:
            if len(masked_lms) >= num_to_predict:
                break
            if index in covered_indexes:
                continue
            covered_indexes.add(index)

            masked_token = None
            # 80% mask
            if random.random() < 0.8:
                masked_token = "[MASK]"
            else:
                # 10% Keep Original
                if random.random() < 0.5:
                    masked_token = tokens[index]
                # 10% replace w/ random word
                else:
                    masked_token = self.vocab_words[random.randint(
                        0, len(self.vocab_words) - 1)]

            output_tokens[index] = masked_token
            masked_lms.append(MaskedLMInstance(index=index,
                                               label=tokens[index]))

        masked_lms = sorted(masked_lms, key=lambda x: x.index)
        masked_lm_output = [-1] * len(output_tokens)
        for p in masked_lms:
            masked_lm_output[p.index] = self.tokenizer.vocab[p.label]

        return (output_tokens, masked_lm_output)
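To see the 80/10/10 masking rule above in isolation, here is a self-contained sketch that mirrors the same decision logic outside the class (the vocabulary and sentence are made up for illustration; the real dataset draws them from the BertTokenizer vocab):

import random

# Hypothetical inputs, not repository data.
vocab_words = ["the", "cat", "sat", "mat", "dog"]
tokens = ["[CLS]", "the", "cat", "sat", "[SEP]"]

def mask_one(token):
    if random.random() < 0.8:        # 80%: replace with [MASK]
        return "[MASK]"
    elif random.random() < 0.5:      # 10%: keep the original token
        return token
    else:                            # 10%: substitute a random vocab word
        return vocab_words[random.randint(0, len(vocab_words) - 1)]

masked = [t if t in ("[CLS]", "[SEP]") else mask_one(t) for t in tokens]
print(masked)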