chenpangpang / transformers · Commits

Commit 35e6baab (unverified)
Authored Jun 14, 2019 by Thomas Wolf; committed via GitHub on Jun 14, 2019
Parents: 5e1207b8, f9cde97b

    Merge branch 'master' into attention

Showing 13 changed files with 790 additions and 375 deletions (+790, -375)
Files changed:

  examples/lm_finetuning/finetune_on_pregenerated.py    +1    -2
  examples/lm_finetuning/pregenerate_training_data.py   +59   -23
  examples/lm_finetuning/simple_lm_finetuning.py        +30   -30
  examples/run_classifier.py                            +33   -33
  examples/run_openai_gpt.py                            +16   -15
  examples/run_squad.py                                 +35   -35
  examples/run_swag.py                                  +35   -35
  hubconf.py                                            +17   -185
  hubconfs/bert_hubconf.py (new file)                   +348  -0
  hubconfs/gpt_hubconf.py (new file)                    +183  -0
  pytorch_pretrained_bert/file_utils.py                 +12   -3
  pytorch_pretrained_bert/modeling.py                   +11   -8
  pytorch_pretrained_bert/modeling_openai.py            +10   -6
examples/lm_finetuning/finetune_on_pregenerated.py

@@ -315,8 +315,7 @@ def main():
                 if args.fp16:
                     # modify learning rate with special warm up BERT uses
                     # if args.fp16 is False, BertAdam is used that handles this automatically
-                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
-                                                                             args.warmup_proportion)
+                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
                 optimizer.step()
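The only change in this file, repeated in the other example scripts further down, is the first argument to warmup_linear.get_lr: the raw global_step instead of the pre-computed fraction global_step/num_train_optimization_steps. A minimal sketch of the semantics this implies, namely that the schedule now normalizes the step with the t_total it was built with; the class and names below are illustrative only, not the library's actual implementation, and the extra argument the scripts still pass is simply ignored here:

    # Hedged sketch: assumes the updated schedule divides the raw step by its own t_total.
    class LinearWarmupSketch:
        def __init__(self, warmup, t_total):
            self.warmup = warmup      # warmup fraction of training, e.g. 0.1
            self.t_total = t_total    # total number of optimization steps

        def get_lr(self, step, *_ignored):
            progress = step / self.t_total                 # normalization now happens here
            if progress < self.warmup:
                return progress / self.warmup              # linear ramp-up to 1.0
            return max(0.0, (progress - 1.0) / (self.warmup - 1.0))  # linear decay to 0.0

    sched = LinearWarmupSketch(warmup=0.1, t_total=1000)
    print(sched.get_lr(50))    # 0.5  (halfway through warmup)
    print(sched.get_lr(550))   # 0.5  (halfway through the decay phase)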
examples/lm_finetuning/pregenerate_training_data.py

@@ -4,11 +4,11 @@ from tqdm import tqdm, trange
 from tempfile import TemporaryDirectory
 import shelve

-from random import random, randrange, randint, shuffle, choice, sample
+from random import random, randrange, randint, shuffle, choice
 from pytorch_pretrained_bert.tokenization import BertTokenizer
 import numpy as np
 import json
+import collections


 class DocumentDatabase:
     def __init__(self, reduce_memory=False):

@@ -98,42 +98,77 @@ def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
         else:
             trunc_tokens.pop()

+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])

-def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list):
+def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
     """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
     with several refactors to clean it up and remove a lot of unnecessary variables."""
     cand_indices = []
     for (i, token) in enumerate(tokens):
         if token == "[CLS]" or token == "[SEP]":
             continue
-        cand_indices.append(i)
+        # Whole Word Masking means that if we mask all of the wordpieces
+        # corresponding to an original word. When a word has been split into
+        # WordPieces, the first token does not have any marker and any subsequence
+        # tokens are prefixed with ##. So whenever we see the ## token, we
+        # append it to the previous set of word indexes.
+        #
+        # Note that Whole Word Masking does *not* change the training code
+        # at all -- we still predict each WordPiece independently, softmaxed
+        # over the entire vocabulary.
+        if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")):
+            cand_indices[-1].append(i)
+        else:
+            cand_indices.append([i])

     num_to_mask = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))
     shuffle(cand_indices)
-    mask_indices = sorted(sample(cand_indices, num_to_mask))
-    masked_token_labels = []
-    for index in mask_indices:
-        # 80% of the time, replace with [MASK]
-        if random() < 0.8:
-            masked_token = "[MASK]"
-        else:
-            # 10% of the time, keep original
-            if random() < 0.5:
-                masked_token = tokens[index]
-            # 10% of the time, replace with random word
-            else:
-                masked_token = choice(vocab_list)
-        masked_token_labels.append(tokens[index])
-        # Once we've saved the true label for that token, we can overwrite it with the masked version
-        tokens[index] = masked_token
+    masked_lms = []
+    covered_indexes = set()
+    for index_set in cand_indices:
+        if len(masked_lms) >= num_to_mask:
+            break
+        # If adding a whole-word mask would exceed the maximum number of
+        # predictions, then just skip this candidate.
+        if len(masked_lms) + len(index_set) > num_to_mask:
+            continue
+        is_any_index_covered = False
+        for index in index_set:
+            if index in covered_indexes:
+                is_any_index_covered = True
+                break
+        if is_any_index_covered:
+            continue
+        for index in index_set:
+            covered_indexes.add(index)
+
+            masked_token = None
+            # 80% of the time, replace with [MASK]
+            if random() < 0.8:
+                masked_token = "[MASK]"
+            else:
+                # 10% of the time, keep original
+                if random() < 0.5:
+                    masked_token = tokens[index]
+                # 10% of the time, replace with random word
+                else:
+                    masked_token = choice(vocab_list)
+            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+            tokens[index] = masked_token
+
+    assert len(masked_lms) <= num_to_mask
+    masked_lms = sorted(masked_lms, key=lambda x: x.index)
+    mask_indices = [p.index for p in masked_lms]
+    masked_token_labels = [p.label for p in masked_lms]

     return tokens, mask_indices, masked_token_labels


 def create_instances_from_document(
         doc_database, doc_idx, max_seq_length, short_seq_prob,
-        masked_lm_prob, max_predictions_per_seq, vocab_list):
+        masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
     """This code is mostly a duplicate of the equivalent function from Google BERT's repo.
     However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
     Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence

@@ -213,7 +248,7 @@ def create_instances_from_document(
         segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]

         tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
-            tokens, masked_lm_prob, max_predictions_per_seq, vocab_list)
+            tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)

         instance = {
             "tokens": tokens,

@@ -237,7 +272,8 @@ def main():
                         choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                  "bert-base-multilingual", "bert-base-chinese"])
     parser.add_argument("--do_lower_case", action="store_true")
+    parser.add_argument("--do_whole_word_mask", action="store_true",
+                        help="Whether to use whole word masking rather than per-WordPiece masking.")
     parser.add_argument("--reduce_memory", action="store_true",
                         help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")

@@ -284,7 +320,7 @@ def main():
                 doc_instances = create_instances_from_document(
                     docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
                     masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
-                    vocab_list=vocab_list)
+                    whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list)
                 doc_instances = [json.dumps(instance) for instance in doc_instances]
                 for instance in doc_instances:
                     epoch_file.write(instance + '\n')
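For reference, a small self-contained illustration of the candidate-grouping rule the new create_masked_lm_predictions uses: with --do_whole_word_mask, continuation WordPieces (prefixed with "##") are grouped with the token that starts the word, so a word is either masked in full or left alone. The helper below is a rewrite for illustration only, not the script's own code:

    # Minimal illustration of the candidate-index grouping introduced above.
    tokens = ["[CLS]", "un", "##afford", "##able", "prices", "[SEP]"]

    def group_candidates(tokens, whole_word_mask=True):
        cand_indices = []
        for i, token in enumerate(tokens):
            if token in ("[CLS]", "[SEP]"):
                continue
            if whole_word_mask and cand_indices and token.startswith("##"):
                cand_indices[-1].append(i)   # continuation piece joins the previous word
            else:
                cand_indices.append([i])     # new word starts a new candidate set
        return cand_indices

    print(group_candidates(tokens, whole_word_mask=True))   # [[1, 2, 3], [4]]
    print(group_candidates(tokens, whole_word_mask=False))  # [[1], [2], [3], [4]]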
examples/lm_finetuning/simple_lm_finetuning.py

@@ -534,36 +534,37 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:

@@ -603,8 +604,7 @@ def main():
                     if args.fp16:
                         # modify learning rate with special warm up BERT uses
                         # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
-                                                                                 args.warmup_proportion)
+                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                         for param_group in optimizer.param_groups:
                             param_group['lr'] = lr_this_step
                     optimizer.step()
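This hunk, like the matching hunks in run_classifier.py, run_squad.py and run_swag.py below, moves the optimizer preparation under if args.do_train:, so evaluation-only runs no longer build an optimizer or require apex. The parameter grouping itself is the usual BERT weight-decay split; a hedged, standalone sketch of that pattern on a toy module:

    import torch

    # Sketch of the grouping used above: weight decay applies to everything except
    # biases and LayerNorm parameters, matched by substrings of the parameter names.
    class TinyBlock(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.dense = torch.nn.Linear(8, 8)
            self.LayerNorm = torch.nn.LayerNorm(8)   # attribute named as in BERT so the filter matches

    model = TinyBlock()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # Any optimizer that accepts per-group options can consume this; the scripts pass
    # the same list to BertAdam or FusedAdam.
    optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=3e-5)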
examples/run_classifier.py

@@ -271,7 +271,7 @@ class StsbProcessor(DataProcessor):

 class QqpProcessor(DataProcessor):
-    """Processor for the STS-B data set (GLUE version)."""
+    """Processor for the QQP data set (GLUE version)."""

     def get_train_examples(self, data_dir):
         """See base class."""

@@ -306,7 +306,7 @@ class QqpProcessor(DataProcessor):

 class QnliProcessor(DataProcessor):
-    """Processor for the STS-B data set (GLUE version)."""
+    """Processor for the QNLI data set (GLUE version)."""

     def get_train_examples(self, data_dir):
         """See base class."""

@@ -763,35 +763,36 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     nb_tr_steps = 0

@@ -854,8 +855,7 @@ def main():
                 if args.fp16:
                     # modify learning rate with special warm up BERT uses
                     # if args.fp16 is False, BertAdam is used that handles this automatically
-                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
-                                                                             args.warmup_proportion)
+                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()

@@ -939,7 +939,7 @@ def main():
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
-       loss = tr_loss/nb_tr_steps if args.do_train else None
+       loss = tr_loss/global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step

@@ -1007,7 +1007,7 @@ def main():
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
-           loss = tr_loss/nb_tr_steps if args.do_train else None
+           loss = tr_loss/global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
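The last two hunks change the denominator of the reported training loss from nb_tr_steps to global_step. In these example scripts nb_tr_steps is incremented once per batch while global_step is incremented once per optimizer update, so with gradient accumulation the two averages differ; a toy illustration of just that arithmetic (numbers are made up):

    # Toy numbers only: shows why the two denominators give different averages.
    gradient_accumulation_steps = 4
    nb_tr_steps = 100                                          # micro-batches seen
    global_step = nb_tr_steps // gradient_accumulation_steps   # 25 optimizer updates
    tr_loss = 50.0                                             # accumulated training loss

    print(tr_loss / nb_tr_steps)   # 0.5  -> denominator used before this commit
    print(tr_loss / global_step)   # 2.0  -> denominator used after this commit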
examples/run_openai_gpt.py

@@ -83,8 +83,8 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
             input_ids[i, 1, :len(with_cont2)] = with_cont2
             mc_token_ids[i, 0] = len(with_cont1) - 1
             mc_token_ids[i, 1] = len(with_cont2) - 1
-            lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
-            lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
+            lm_labels[i, 0, :len(with_cont1)] = with_cont1
+            lm_labels[i, 1, :len(with_cont2)] = with_cont2
             mc_labels[i] = mc_label
         all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
         tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))

@@ -183,19 +183,20 @@ def main():
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-    ]
-    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
-    optimizer = OpenAIAdam(optimizer_grouped_parameters,
-                           lr=args.learning_rate,
-                           warmup=args.warmup_proportion,
-                           max_grad_norm=args.max_grad_norm,
-                           weight_decay=args.weight_decay,
-                           t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+        num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
+        optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                               lr=args.learning_rate,
+                               warmup=args.warmup_proportion,
+                               max_grad_norm=args.max_grad_norm,
+                               weight_decay=args.weight_decay,
+                               t_total=num_train_optimization_steps)

     if args.do_train:
         nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
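The first hunk stops pre-shifting the language-modeling labels in the data preparation; lm_labels now simply mirror the input tokens. That is consistent with a loss that shifts logits and labels internally, which is how causal LM losses are usually written; whether modeling_openai.py compensates in exactly this way is not visible in the hunks shown here, so treat the following as a generic sketch rather than the library's code:

    import torch
    import torch.nn.functional as F

    def causal_lm_loss(lm_logits, lm_labels, ignore_index=-1):
        # Generic next-token objective: position t predicts token t+1, so the logits are
        # truncated at the end and the labels are shifted left by one inside the loss.
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = lm_labels[..., 1:].contiguous()
        return F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                               shift_labels.view(-1),
                               ignore_index=ignore_index)

    logits = torch.randn(1, 5, 10)           # batch 1, sequence 5, vocab 10
    labels = torch.randint(0, 10, (1, 5))    # unshifted labels, as prepared after this hunk
    print(causal_lm_loss(logits, labels))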
examples/run_squad.py

@@ -922,40 +922,41 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-    ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:

@@ -1015,8 +1016,7 @@ def main():
                 if args.fp16:
                     # modify learning rate with special warm up BERT uses
                     # if args.fp16 is False, BertAdam is used and handles this automatically
-                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
-                                                                             args.warmup_proportion)
+                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
                 optimizer.step()
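run_squad.py keeps the name-based hack that drops the unused pooler parameters from the optimizer, since (per the in-code comment) their gradients stay None and break apex. A tiny standalone sketch of that filter on a toy model:

    import torch

    # Toy model with a "pooler" submodule that never receives gradients.
    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.encoder = torch.nn.Linear(4, 4)
            self.pooler = torch.nn.Linear(4, 4)   # unused head, analogous to BERT's pooler

    model = Toy()
    param_optimizer = list(model.named_parameters())
    # Same name-based filter as in the diff: drop every parameter whose name contains 'pooler'.
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    print([name for name, _ in param_optimizer])   # ['encoder.weight', 'encoder.bias']

Note that the matching hunk in run_swag.py below keeps the two comment lines but drops the filter condition, so all parameters remain in that script's optimizer.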
examples/run_swag.py

@@ -385,39 +385,40 @@ def main():
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-    ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)

     global_step = 0
     if args.do_train:

@@ -466,8 +467,7 @@ def main():
                 if args.fp16:
                     # modify learning rate with special warm up BERT uses
                     # if args.fp16 is False, BertAdam is used that handles this automatically
-                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
-                                                                             args.warmup_proportion)
+                    lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                     for param_group in optimizer.param_groups:
                         param_group['lr'] = lr_this_step
                 optimizer.step()

@@ -540,7 +540,7 @@ def main():
         result = {'eval_loss': eval_loss,
                   'eval_accuracy': eval_accuracy,
                   'global_step': global_step,
-                  'loss': tr_loss/nb_tr_steps}
+                  'loss': tr_loss/global_step}

         output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
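All four training scripts keep the same fp16 path: FusedAdam wrapped in FP16_Optimizer, with args.loss_scale == 0 selecting dynamic loss scaling and any other value a static scale. As a conceptual reminder of what loss scaling does, here is a generic sketch of the static case (not apex's implementation):

    import torch

    def scaled_backward_and_step(loss, optimizer, loss_scale=128.0):
        # Scale the loss up so small fp16 gradients do not underflow, then unscale
        # the gradients before the optimizer step.
        (loss * loss_scale).backward()
        for group in optimizer.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    p.grad.div_(loss_scale)
        optimizer.step()
        optimizer.zero_grad()

    model = torch.nn.Linear(4, 1)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    out = model(torch.randn(2, 4)).mean()
    scaled_backward_and_step(out, opt)
    # Dynamic scaling (loss_scale == 0 in the scripts) additionally grows or shrinks the
    # scale when gradients overflow, which apex's FP16_Optimizer handles automatically.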
hubconf.py

The 185 removed lines are the BERT hub entry points that previously lived here: the shared bert_docstring, the _append_from_pretrained_docstring decorator, and the bertTokenizer, bertModel, bertForNextSentencePrediction, bertForPreTraining, bertForMaskedLM, bertForSequenceClassification, bertForMultipleChoice, bertForQuestionAnswering and bertForTokenClassification wrappers. They move into hubconfs/bert_hubconf.py below (where the docstrings gain torch.hub usage examples), and hubconf.py now only declares the shared dependencies and re-exports the entry points:

-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.modeling import (
-    BertModel,
-    BertForNextSentencePrediction,
-    BertForMaskedLM,
-    BertForMultipleChoice,
-    BertForPreTraining,
-    BertForQuestionAnswering,
-    BertForSequenceClassification,
-    BertForTokenClassification,
-)
-
 dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']

-# A lot of models share the same param doc. Use a decorator
-# to save typing
-bert_docstring = """ ... """   (plus the decorator and the nine bert* wrapper functions; see note above)
+from hubconfs.bert_hubconf import (
+    bertTokenizer,
+    bertModel,
+    bertForNextSentencePrediction,
+    bertForPreTraining,
+    bertForMaskedLM,
+    bertForSequenceClassification,
+    bertForMultipleChoice,
+    bertForQuestionAnswering,
+    bertForTokenClassification
+)
+from hubconfs.gpt_hubconf import (
+    openAIGPTTokenizer,
+    openAIGPTModel,
+    openAIGPTLMHeadModel,
+    openAIGPTDoubleHeadsModel
+)
\ No newline at end of file
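With the split into hubconfs/, the hub entry points keep the same names, so torch.hub calls like the ones in the docstrings below keep working. A usage sketch assembled from those docstring examples (it downloads the pinned repo and pretrained weights on first use, so network access and a writable cache are assumed):

    import torch

    # Same call shape as the examples in hubconfs/bert_hubconf.py.
    tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer',
                               'bert-base-cased', do_basic_tokenize=False)
    model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
    model.eval()

    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
    indexed = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    tokens_tensor = torch.tensor([indexed])
    segments_tensor = torch.zeros_like(tokens_tensor)   # single-segment input for brevity
    with torch.no_grad():
        encoded_layers, pooled = model(tokens_tensor, segments_tensor)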
hubconfs/bert_hubconf.py (new file)

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import (
    BertModel,
    BertForNextSentencePrediction,
    BertForMaskedLM,
    BertForMultipleChoice,
    BertForPreTraining,
    BertForQuestionAnswering,
    BertForSequenceClassification,
    BertForTokenClassification,
)

# A lot of models share the same param doc. Use a decorator
# to save typing
bert_docstring = """
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
instance
- a path or url to a pretrained model archive containing:
. `bert_config.json` a configuration file for the model
. `model.chkpt` a TensorFlow checkpoint
from_tf: should we load the weights from a locally saved TensorFlow
checkpoint
cache_dir: an optional path to a folder in which the pre-trained models
will be cached.
state_dict: an optional state dictionnary
(collections.OrderedDict object) to use instead of Google
pre-trained models
*inputs, **kwargs: additional input for the specific Bert class
(ex: num_labels for BertForSequenceClassification)
"""
def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator


def bertTokenizer(*args, **kwargs):
    """
Instantiate a BertTokenizer from a pre-trained/customized vocab file
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* bert-base-uncased
* bert-large-uncased
* bert-base-cased
* bert-large-cased
* bert-base-multilingual-uncased
* bert-base-multilingual-cased
* bert-base-chinese
Keyword args:
cache_dir: an optional path to a specific directory to download and cache
the pre-trained model weights.
Default: None
do_lower_case: Whether to lower case the input.
Only has an effect when do_wordpiece_only=False
Default: True
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
Default: True
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
Default: None
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
Example:
>>> sentence = 'Hello, World!'
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
>>> toks = tokenizer.tokenize(sentence)
['Hello', '##,', 'World', '##!']
>>> ids = tokenizer.convert_tokens_to_ids(toks)
[8667, 28136, 1291, 28125]
"""
tokenizer
=
BertTokenizer
.
from_pretrained
(
*
args
,
**
kwargs
)
return
tokenizer
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertModel
(
*
args
,
**
kwargs
):
"""
BertModel is the basic BERT Transformer model with a layer of summed token,
position and sequence embeddings followed by a series of identical
self-attention blocks (12 for BERT-base, 24 for BERT-large).
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertModel
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
>>> model.eval()
# Predict hidden states features for each layer
>>> with torch.no_grad():
encoded_layers, _ = model(tokens_tensor, segments_tensors)
"""
model
=
BertModel
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertForNextSentencePrediction
(
*
args
,
**
kwargs
):
"""
BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence
classification head.
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForNextSentencePrediction
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased')
>>> model.eval()
# Predict the next sentence classification logits
>>> with torch.no_grad():
next_sent_classif_logits = model(tokens_tensor, segments_tensors)
"""
model
=
BertForNextSentencePrediction
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertForPreTraining
(
*
args
,
**
kwargs
):
"""
BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads
- the masked language modeling head, and
- the next sentence classification head.
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForPreTraining
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
>>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
"""
model
=
BertForPreTraining
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertForMaskedLM
(
*
args
,
**
kwargs
):
"""
BertForMaskedLM includes the BertModel Transformer followed by the
(possibly) pre-trained masked language modeling head.
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> masked_index = 8
>>> tokenized_text[masked_index] = '[MASK]'
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForMaskedLM
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
>>> model.eval()
# Predict all tokens
>>> with torch.no_grad():
predictions = model(tokens_tensor, segments_tensors)
>>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
>>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
'henson'
"""
model
=
BertForMaskedLM
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertForSequenceClassification
(
*
args
,
**
kwargs
):
"""
BertForSequenceClassification is a fine-tuning model that includes
BertModel and a sequence-level (sequence or pair of sequences) classifier
on top of the BertModel. Note that the classification head is only initialized
and has to be trained.
The sequence-level classifier is a linear layer that takes as input the
last hidden state of the first character in the input sequence
(see Figures 3a and 3b in the BERT paper).
Args:
num_labels: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForSequenceClassification
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
>>> model.eval()
# Predict the sequence classification logits
>>> with torch.no_grad():
seq_classif_logits = model(tokens_tensor, segments_tensors)
# Or get the sequence classification loss
>>> labels = torch.tensor([1])
>>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model
=
BertForSequenceClassification
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertForMultipleChoice
(
*
args
,
**
kwargs
):
"""
BertForMultipleChoice is a fine-tuning model that includes BertModel and a
linear layer on top of the BertModel. Note that the multiple choice head is
only initialized and has to be trained.
Args:
num_choices: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
>>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
# Load bertForMultipleChoice
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
>>> model.eval()
# Predict the multiple choice logits
>>> with torch.no_grad():
multiple_choice_logits = model(tokens_tensor, segments_tensors)
# Or get the multiple choice loss
>>> labels = torch.tensor([1])
>>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model
=
BertForMultipleChoice
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertForQuestionAnswering
(
*
args
,
**
kwargs
):
"""
BertForQuestionAnswering is a fine-tuning model that includes BertModel
with a token-level classifiers on top of the full sequence of last hidden
states. Note that the classification head is only initialized
and has to be trained.
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForQuestionAnswering
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased')
>>> model.eval()
# Predict the start and end positions logits
>>> with torch.no_grad():
start_logits, end_logits = model(tokens_tensor, segments_tensors)
# Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
>>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
# set model.train() before if training this loss
>>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
"""
model
=
BertForQuestionAnswering
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
@
_append_from_pretrained_docstring
(
bert_docstring
)
def
bertForTokenClassification
(
*
args
,
**
kwargs
):
"""
BertForTokenClassification is a fine-tuning model that includes BertModel
and a token-level classifier on top of the BertModel. Note that the classification
head is only initialized and has to be trained.
The token-level classifier is a linear layer that takes as input the last
hidden state of the sequence.
Args:
num_labels: the number (>=2) of classes for the classifier.
Example:
# Load the tokenizer
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
# Prepare tokenized input
>>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
>>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
>>> tokens_tensor = torch.tensor([indexed_tokens])
>>> segments_tensors = torch.tensor([segments_ids])
# Load bertForTokenClassification
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
>>> model.eval()
# Predict the token classification logits
>>> with torch.no_grad():
classif_logits = model(tokens_tensor, segments_tensors)
# Or get the token classification loss
>>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
>>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
"""
model
=
BertForTokenClassification
.
from_pretrained
(
*
args
,
**
kwargs
)
return
model
hubconfs/gpt_hubconf.py (new file)

from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
from pytorch_pretrained_bert.modeling_openai import (
    OpenAIGPTModel,
    OpenAIGPTLMHeadModel,
    OpenAIGPTDoubleHeadsModel
)

# Dependecies that are not specified in global hubconf.py
specific_dependencies = ['spacy', 'ftfy']

# A lot of models share the same param doc. Use a decorator
# to save typing
gpt_docstring = """
OpenAI GPT use a single embedding matrix to store the word and special embeddings.
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
Special tokens need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
The embeddings are ordered as follow in the token embeddings matrice:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1] ______________________
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
total_tokens_embeddings = config.vocab_size + config.n_special
You should use the associate indices to index the embeddings.
Params:
pretrained_model_name_or_path: either:
- a str with the name of a pre-trained model to load selected in the list of:
. `openai-gpt`
- a path or url to a pretrained model archive containing:
. `openai_gpt_config.json` a configuration file for the model
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
- a path or url to a pretrained model archive containing:
. `openai-gpt-config.json` a configuration file for the model
. a series of NumPy files containing OpenAI TensorFlow trained weights
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
state_dict: an optional state dictionnary (collections.OrderedDict object)
to use instead of pre-trained models
*inputs, **kwargs: additional input for the specific OpenAI-GPT class
"""
def _append_from_pretrained_docstring(docstr):
    def docstring_decorator(fn):
        fn.__doc__ = fn.__doc__ + docstr
        return fn
    return docstring_decorator


def openAIGPTTokenizer(*args, **kwargs):
    """
Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
Peculiarities:
- lower case all inputs
- uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
- argument special_tokens and function set_special_tokens:
can be used to add additional symbols (ex: "__classify__") to a vocabulary.
Args:
pretrained_model_name_or_path: Path to pretrained model archive
or one of pre-trained vocab configs below.
* openai-gpt
Keyword args:
special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
Default: None
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
Default: None
Example:
>>> import torch
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
>>> tokenized_text = tokenizer.tokenize(text)
>>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
[763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
"""
tokenizer
=
OpenAIGPTTokenizer
.
from_pretrained
(
*
args
,
**
kwargs
)
return
tokenizer
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTModel(*args, **kwargs):
    """
    OpenAIGPTModel is the basic OpenAI GPT Transformer model, built from
    identical stacked masked self-attention blocks and pre-trained
    on a large-scale dataset using a language modeling signal.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')

        # Prepare tokenized input
        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTModel
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')
        >>> model.eval()

        # Compute the hidden states for the input tokens
        >>> with torch.no_grad():
                hidden_states = model(tokens_tensor)
    """
    model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTLMHeadModel(*args, **kwargs):
    """
    OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
    tied (pre-trained) language modeling head on top.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')

        # Prepare tokenized input
        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTLMHeadModel
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')
        >>> model.eval()

        # Predict the language modeling logits for each position
        >>> with torch.no_grad():
                predictions = model(tokens_tensor)

        # Get the predicted next token
        >>> predicted_index = torch.argmax(predictions[0, -1, :]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        '.</w>'
    """
    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
    return model
@_append_from_pretrained_docstring(gpt_docstring)
def openAIGPTDoubleHeadsModel(*args, **kwargs):
    """
    OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
    tied (pre-trained) language modeling head and a multiple choice
    classification head (only initialized, not pre-trained).

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')

        # Prepare tokenized input
        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> mc_token_ids = torch.LongTensor([ [len(tokenized_text)] ])

        # Load openAIGPTDoubleHeadsModel
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
        >>> model.eval()

        # Predict the language modeling and multiple choice logits
        >>> with torch.no_grad():
                lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
    """
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
    return model
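The hub entry points above simply forward their arguments to the corresponding `from_pretrained` calls, so tokenizer and model options can be passed straight through `torch.hub.load`. A minimal sketch (the token name `<cls>` is illustrative, not from the file):

```python
import torch

# Register one custom special token on the tokenizer side...
tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer',
                           'openai-gpt', special_tokens=['<cls>'])
# ...and reserve one special-token embedding on the model side.
model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel',
                       'openai-gpt', num_special_tokens=1)
cls_id = tokenizer.convert_tokens_to_ids(['<cls>'])[0]  # falls in the special-embedding index range
```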
pytorch_pretrained_bert/file_utils.py
View file @
35e6baab
...
@@ -22,6 +22,15 @@ import requests
 from botocore.exceptions import ClientError
 from tqdm import tqdm
 
+try:
+    from torch.hub import _get_torch_home
+    torch_cache_home = _get_torch_home()
+except ImportError:
+    torch_cache_home = os.path.expanduser(
+        os.getenv('TORCH_HOME', os.path.join(
+            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
+default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert')
+
 try:
     from urllib.parse import urlparse
 except ImportError:
...
@@ -29,11 +38,11 @@ except ImportError:
 try:
     from pathlib import Path
-    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
-                                                   Path.home() / '.pytorch_pretrained_bert'))
+    PYTORCH_PRETRAINED_BERT_CACHE = Path(
+        os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
 except (AttributeError, ImportError):
     PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
-                                              os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
+                                              default_cache_path)
 
 CONFIG_NAME = "config.json"
 WEIGHTS_NAME = "pytorch_model.bin"
...
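The net effect of this change is that the default cache now lives under the torch cache home rather than `~/.pytorch_pretrained_bert`, while an explicit `PYTORCH_PRETRAINED_BERT_CACHE` still takes precedence. A minimal sketch of the resolution order, mirroring the added code above:

```python
import os

# Same fallback chain as the new default_cache_path above.
torch_cache_home = os.path.expanduser(
    os.getenv('TORCH_HOME', os.path.join(
        os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert')

# An explicitly set PYTORCH_PRETRAINED_BERT_CACHE still wins.
cache_dir = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)
print(cache_dir)  # e.g. ~/.cache/torch/pytorch_pretrained_bert on a default Linux setup
```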
pytorch_pretrained_bert/modeling.py
View file @
35e6baab
...
@@ -145,7 +145,8 @@ class BertConfig(object):
                  attention_probs_dropout_prob=0.1,
                  max_position_embeddings=512,
                  type_vocab_size=2,
-                 initializer_range=0.02):
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12):
         """Constructs BertConfig.
 
         Args:
...
@@ -169,6 +170,7 @@ class BertConfig(object):
                 `BertModel`.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
+            layer_norm_eps: The epsilon used by LayerNorm.
         """
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                         and isinstance(vocab_size_or_config_json_file, unicode)):
...
@@ -188,6 +190,7 @@ class BertConfig(object):
             self.max_position_embeddings = max_position_embeddings
             self.type_vocab_size = type_vocab_size
             self.initializer_range = initializer_range
+            self.layer_norm_eps = layer_norm_eps
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")
...
@@ -254,7 +257,7 @@ class BertEmbeddings(nn.Module):
         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
         # any TensorFlow checkpoint file
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, input_ids, token_type_ids=None):
...
@@ -332,7 +335,7 @@ class BertSelfOutput(nn.Module):
     def __init__(self, config):
         super(BertSelfOutput, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
...
@@ -378,7 +381,7 @@ class BertOutput(nn.Module):
     def __init__(self, config):
         super(BertOutput, self).__init__()
         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
...
@@ -454,7 +457,7 @@ class BertPredictionHeadTransform(nn.Module):
             self.transform_act_fn = ACT2FN[config.hidden_act]
         else:
             self.transform_act_fn = config.hidden_act
-        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
...
@@ -1020,7 +1023,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels, output_attentions=False):
+    def __init__(self, config, num_labels=2, output_attentions=False):
         super(BertForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
         self.num_labels = num_labels
...
@@ -1091,7 +1094,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_choices, output_attentions=False):
+    def __init__(self, config, num_choices=2, output_attentions=False):
         super(BertForMultipleChoice, self).__init__(config)
         self.output_attentions = output_attentions
         self.num_choices = num_choices
...
@@ -1167,7 +1170,7 @@ class BertForTokenClassification(BertPreTrainedModel):
     logits = model(input_ids, token_type_ids, input_mask)
     ```
     """
-    def __init__(self, config, num_labels, output_attentions=False):
+    def __init__(self, config, num_labels=2, output_attentions=False):
         super(BertForTokenClassification, self).__init__(config)
         self.output_attentions = output_attentions
         self.num_labels = num_labels
...
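Taken together, these modeling.py changes make the LayerNorm epsilon configurable through `BertConfig` and give the classification heads a default label/choice count of 2. A minimal sketch (the values are illustrative, not from the diff):

```python
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification

# layer_norm_eps is now a regular config field (default 1e-12); every BertLayerNorm
# reads it from the config instead of a hard-coded epsilon.
config = BertConfig(vocab_size_or_config_json_file=32000, layer_norm_eps=1e-5)

# num_labels no longer has to be passed explicitly; it defaults to 2.
model = BertForSequenceClassification(config)
```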
pytorch_pretrained_bert/modeling_openai.py
View file @
35e6baab
...
@@ -434,9 +434,7 @@ class OpenAIGPTPreTrainedModel(nn.Module):
             module.bias.data.zero_()
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None,
-                        from_tf=False, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs):
         """
         Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.
...
@@ -449,14 +447,20 @@ class OpenAIGPTPreTrainedModel(nn.Module):
                 . `openai_gpt_config.json` a configuration file for the model
                 . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
             - a path or url to a pretrained model archive containing:
-                . `bert_config.json` a configuration file for the model
+                . `openai-gpt-config.json` a configuration file for the model
                 . a series of NumPy files containing OpenAI TensorFlow trained weights
         from_tf: should we load the weights from a locally saved TensorFlow checkpoint
         cache_dir: an optional path to a folder in which the pre-trained models will be cached.
         state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific Bert class
-            (ex: num_labels for BertForSequenceClassification)
+        *inputs, **kwargs: additional input for the specific OpenAI-GPT class
         """
+        state_dict = kwargs.get('state_dict', None)
+        kwargs.pop('state_dict', None)
+        cache_dir = kwargs.get('cache_dir', None)
+        kwargs.pop('cache_dir', None)
+        from_tf = kwargs.get('from_tf', False)
+        kwargs.pop('from_tf', None)
         if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
             archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
             config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
...
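With the updated signature, `state_dict`, `cache_dir` and `from_tf` are read out of `**kwargs`, so they can be forwarded unchanged through the torch.hub entry points defined in hubconfs/gpt_hubconf.py above. A minimal sketch (the cache path is a placeholder):

```python
import torch

# cache_dir travels through torch.hub.load -> openAIGPTModel(*args, **kwargs)
# -> OpenAIGPTPreTrainedModel.from_pretrained, where it is popped from kwargs.
model = torch.hub.load(
    'huggingface/pytorch-pretrained-BERT',
    'openAIGPTModel',
    'openai-gpt',
    cache_dir='/tmp/openai_gpt_cache',  # placeholder; defaults to PYTORCH_PRETRAINED_BERT_CACHE
)
model.eval()
```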