Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
5676d6f7
Commit
5676d6f7
authored
Nov 03, 2018
by
VictorSanh
Browse files
Remove BERT pretraining files for now
parent
8ec457d3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
0 additions
and
889 deletions
+0
-889
create_pretraining_data_pytorch.py
create_pretraining_data_pytorch.py
+0
-429
run_pretraining_pytorch.py
run_pretraining_pytorch.py
+0
-460
No files found.
create_pretraining_data_pytorch.py
deleted
100644 → 0
View file @
8ec457d3
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
import
random
import
tokenization
import
tensorflow
as
tf
import
argparse
# Command-line interface for the pre-training data generator. The parsed
# namespace is stored in the module-level `args`, which `main` reads.
parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--input_file", default=None, type=str, required=True,
                    help="Input raw text file (or comma-separated list of files).")
parser.add_argument("--output_file", default=None, type=str, required=True,
                    help="Output TF example file (or comma-separated list of files).")
parser.add_argument("--vocab_file", default=None, type=str, required=True,
                    help="The vocabulary file that the BERT model was trained on.")

## Other parameters
# `--do_lower_case` combines `default=True` with `store_true`, so on its own
# it can never yield False. It is kept unchanged for backward compatibility;
# `--no_lower_case` (same destination) is the way to disable lower-casing.
parser.add_argument("--do_lower_case", default=True, action='store_true',
                    help="Whether to lower case the input text. Should be True for uncased "
                         "models and False for cased models.")
parser.add_argument("--no_lower_case", dest="do_lower_case", action='store_false',
                    help="Do not lower case the input text (use for cased models).")
parser.add_argument("--max_seq_length", default=128, type=int,
                    help="Maximum sequence length.")
parser.add_argument("--max_predictions_per_seq", default=20, type=int,
                    help="Maximum number of masked LM predictions per sequence.")
parser.add_argument("--random_seed", default=12345, type=int,
                    help="Random seed for data generation.")
parser.add_argument("--dupe_factor", default=10, type=int,
                    help="Number of times to duplicate the input data (with different masks).")
parser.add_argument("--masked_lm_prob", default=0.15, type=float,
                    help="Masked LM probability.")
parser.add_argument("--short_seq_prob", default=0.1, type=float,
                    help="Probability of creating sequences which are shorter than the maximum length.")

args = parser.parse_args()
class TrainingInstance(object):
    """Plain record describing one pre-training example (a sentence pair).

    Attributes are stored exactly as passed in: the (already masked) token
    sequence, the per-token segment ids, the masked positions with their
    original labels, and the flag telling whether segment B was drawn at
    random.
    """

    def __init__(self, tokens, segment_ids, masked_lm_positions,
                 masked_lm_labels, is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels

    def __str__(self):
        """Render every field on its own line, followed by a blank line."""
        printable = tokenization.printable_text
        pieces = [
            "tokens: %s\n" % " ".join(printable(tok) for tok in self.tokens),
            "segment_ids: %s\n" % " ".join(str(seg) for seg in self.segment_ids),
            "is_random_next: %s\n" % self.is_random_next,
            "masked_lm_positions: %s\n" % " ".join(
                str(pos) for pos in self.masked_lm_positions),
            "masked_lm_labels: %s\n" % " ".join(
                printable(label) for label in self.masked_lm_labels),
            "\n",
        ]
        return "".join(pieces)

    def __repr__(self):
        # The debug representation is the same text as the string form.
        return self.__str__()
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s.

    Each instance is converted to ids, zero-padded to `max_seq_length`
    (tokens) and `max_predictions_per_seq` (masked positions), serialized as
    a `tf.train.Example` and written to the output files in round-robin
    order. The first 20 examples are also logged.

    Args:
      instances: iterable of `TrainingInstance`.
      tokenizer: object providing `convert_tokens_to_ids`.
      max_seq_length: fixed length of the token-level features.
      max_predictions_per_seq: fixed length of the masked-LM features.
      output_files: list of TFRecord paths to write to.

    Raises:
      AssertionError: if an instance has more than `max_seq_length` tokens.
    """
    writers = []
    total_written = 0
    # Writers are closed in `finally` so that a failed assertion or write
    # does not leave record files open.
    try:
        for output_file in output_files:
            writers.append(tf.python_io.TFRecordWriter(output_file))

        writer_index = 0
        for (inst_index, instance) in enumerate(instances):
            input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
            input_mask = [1] * len(input_ids)
            segment_ids = list(instance.segment_ids)
            assert len(input_ids) <= max_seq_length

            # Zero-pad the three token-level features up to max_seq_length.
            seq_padding = [0] * (max_seq_length - len(input_ids))
            input_ids.extend(seq_padding)
            input_mask.extend(seq_padding)
            segment_ids.extend(seq_padding)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            masked_lm_positions = list(instance.masked_lm_positions)
            masked_lm_ids = tokenizer.convert_tokens_to_ids(
                instance.masked_lm_labels)
            masked_lm_weights = [1.0] * len(masked_lm_ids)

            # Padding predictions get weight 0.0 so they can be ignored by
            # a weighted loss. A non-positive count yields no padding.
            num_pred_padding = max_predictions_per_seq - len(masked_lm_positions)
            masked_lm_positions.extend([0] * num_pred_padding)
            masked_lm_ids.extend([0] * num_pred_padding)
            masked_lm_weights.extend([0.0] * num_pred_padding)

            next_sentence_label = 1 if instance.is_random_next else 0

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(input_ids)
            features["input_mask"] = create_int_feature(input_mask)
            features["segment_ids"] = create_int_feature(segment_ids)
            features["masked_lm_positions"] = create_int_feature(
                masked_lm_positions)
            features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
            features["masked_lm_weights"] = create_float_feature(
                masked_lm_weights)
            features["next_sentence_labels"] = create_int_feature(
                [next_sentence_label])

            tf_example = tf.train.Example(
                features=tf.train.Features(feature=features))

            # Spread the examples evenly over the output files.
            writers[writer_index].write(tf_example.SerializeToString())
            writer_index = (writer_index + 1) % len(writers)

            total_written += 1

            if inst_index < 20:
                tf.logging.info("*** Example ***")
                tf.logging.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in instance.tokens]))

                for feature_name in features.keys():
                    feature = features[feature_name]
                    values = []
                    if feature.int64_list.value:
                        values = feature.int64_list.value
                    elif feature.float_list.value:
                        values = feature.float_list.value
                    tf.logging.info(
                        "%s: %s" % (feature_name,
                                    " ".join([str(x) for x in values])))
    finally:
        for writer in writers:
            writer.close()

    tf.logging.info("Wrote %d total instances", total_written)
def create_int_feature(values):
    """Wrap an iterable of integers in a `tf.train.Feature` (int64 list)."""
    int_list = tf.train.Int64List(value=list(values))
    return tf.train.Feature(int64_list=int_list)
def create_float_feature(values):
    """Wrap an iterable of floats in a `tf.train.Feature` (float list)."""
    float_list = tf.train.FloatList(value=list(values))
    return tf.train.Feature(float_list=float_list)
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Build a shuffled list of `TrainingInstance`s from raw text files.

    Expected input file format:
      (1) One sentence per line. These should ideally be actual sentences,
          not entire paragraphs or arbitrary spans of text, because the
          sentence boundaries drive the "next sentence prediction" task.
      (2) Blank lines between documents, so that "next sentence prediction"
          never spans two documents.

    The whole corpus is processed `dupe_factor` times so that the same text
    is seen with different random masks.
    """
    documents = [[]]

    for path in input_files:
        with tf.gfile.GFile(path, "r") as reader:
            while True:
                text = tokenization.convert_to_unicode(reader.readline())
                if not text:
                    # End of file.
                    break
                text = text.strip()

                # A blank line closes the current document.
                if not text:
                    documents.append([])
                pieces = tokenizer.tokenize(text)
                if pieces:
                    documents[-1].append(pieces)

    # Drop documents that ended up without any sentence.
    documents = [doc for doc in documents if doc]
    rng.shuffle(documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for doc_index, _unused_doc in enumerate(documents):
            per_document = create_instances_from_document(
                documents, doc_index, max_seq_length, short_seq_prob,
                masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
            instances.extend(per_document)

    rng.shuffle(instances)
    return instances
def create_instances_from_document(
        all_documents, document_index, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
    """Creates `TrainingInstance`s for a single document.

    Sentences of the document are accumulated into a chunk until a target
    length is reached (or the document ends). The chunk is then split into a
    segment A and a segment B; B is either the actual continuation or text
    taken from a randomly chosen document. The pair is truncated, wrapped in
    [CLS]/[SEP] markers and masked.

    Args:
      all_documents: list of documents, each a list of token lists.
      document_index: index into `all_documents` of the document to process.
      max_seq_length: hard limit on tokens per instance, specials included.
      short_seq_prob: probability of picking a shorter target length.
      masked_lm_prob: forwarded to `create_masked_lm_predictions`.
      max_predictions_per_seq: forwarded to `create_masked_lm_predictions`.
      vocab_words: forwarded to `create_masked_lm_predictions`.
      rng: random source providing `random()` and `randint()`.

    Returns:
      A list of `TrainingInstance` objects.
    """
    document = all_documents[document_index]

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)

    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        # Emit an instance once the chunk is long enough or the document ends.
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)

                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []
                # Random next
                # A single-segment chunk has no real continuation, so it
                # always takes the random branch.
                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # This should rarely go for more than one iteration for large
                    # corpora. However, just to be careful, we try to make sure that
                    # the random document is not the same as the document
                    # we're processing.
                    # NOTE(review): after 10 failed draws (e.g. a one-document
                    # corpus) the current document itself is used, while the
                    # instance is still labelled as random.
                    for _ in range(10):
                        random_document_index = rng.randint(0, len(all_documents) - 1)
                        if random_document_index != document_index:
                            break

                    random_document = all_documents[random_document_index]
                    random_start = rng.randint(0, len(random_document) - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual next
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
                # Trims the pair in place to at most `max_num_tokens` tokens.
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                # Layout: [CLS] A... [SEP] B... [SEP]; segment id 0 covers
                # [CLS], A and the first [SEP], segment id 1 covers the rest.
                tokens = []
                segment_ids = []
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append("[SEP]")
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                (tokens, masked_lm_positions,
                 masked_lm_labels) = create_masked_lm_predictions(
                     tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
                instance = TrainingInstance(
                    tokens=tokens,
                    segment_ids=segment_ids,
                    is_random_next=is_random_next,
                    masked_lm_positions=masked_lm_positions,
                    masked_lm_labels=masked_lm_labels)
                instances.append(instance)
            # Start a fresh chunk for the next instance.
            current_chunk = []
            current_length = 0
        i += 1

    return instances
def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
    """Creates the predictions for the masked LM objective.

    Picks random non-special positions and, for each, substitutes "[MASK]"
    (80%), keeps the original token (10%) or substitutes a random vocabulary
    word (10%). The input list is not modified.

    Returns:
      A tuple `(output_tokens, masked_lm_positions, masked_lm_labels)` where
      positions are sorted ascending and labels are the original tokens at
      those positions.
    """
    special_tokens = ("[CLS]", "[SEP]")
    candidates = [pos for (pos, tok) in enumerate(tokens)
                  if tok not in special_tokens]
    rng.shuffle(candidates)

    output_tokens = list(tokens)

    # At least one prediction, at most `max_predictions_per_seq`.
    expected = int(round(len(tokens) * masked_lm_prob))
    num_to_predict = min(max_predictions_per_seq, max(1, expected))

    # (position, original token) pairs in selection order.
    selected = []
    for pos in candidates:
        if len(selected) >= num_to_predict:
            break

        if rng.random() < 0.8:
            # 80% of the time, replace with [MASK]
            replacement = "[MASK]"
        elif rng.random() < 0.5:
            # 10% of the time, keep original
            replacement = tokens[pos]
        else:
            # 10% of the time, replace with random word
            replacement = vocab_words[rng.randint(0, len(vocab_words) - 1)]

        output_tokens[pos] = replacement
        selected.append((pos, tokens[pos]))

    selected.sort(key=lambda pair: pair[0])

    masked_lm_positions = [pos for (pos, _) in selected]
    masked_lm_labels = [label for (_, label) in selected]

    return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
    """Shrink two token lists in place until they fit in `max_num_tokens`.

    One token at a time is removed from whichever list is currently longer
    (`tokens_b` on ties). Nothing is returned; both lists are mutated.
    """
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        if len(tokens_a) > len(tokens_b):
            victim = tokens_a
        else:
            victim = tokens_b
        assert len(victim) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if rng.random() < 0.5:
            del victim[0]
        else:
            victim.pop()
def main(_):
    """Generate pre-training examples from the files named in `args`.

    Reads the module-level `args`: expands the input patterns, builds the
    training instances with a seeded random generator and writes them to the
    requested output files.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    # Each comma-separated entry is expanded as a glob pattern.
    input_files = [
        path
        for pattern in args.input_file.split(",")
        for path in tf.gfile.Glob(pattern)
    ]

    tf.logging.info("*** Reading from input files ***")
    for path in input_files:
        tf.logging.info(" %s", path)

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob,
        args.max_predictions_per_seq, rng)

    output_files = args.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for path in output_files:
        tf.logging.info(" %s", path)

    write_instance_to_example_files(
        instances, tokenizer, args.max_seq_length,
        args.max_predictions_per_seq, output_files)
# Script entry point: only runs when the file is executed directly.
# Control is handed to `tf.app.run()`, which presumably dispatches to the
# `main` function defined in this module (confirm against the TF docs).
if __name__ == "__main__":
    tf.app.run()
run_pretraining_pytorch.py
deleted
100644 → 0
View file @
8ec457d3
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run masked LM/next sentence masked_lm pre-training for BERT."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
modeling
import
optimization
import
tensorflow
as
tf
import
argparse
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """Returns `model_fn` closure for TPUEstimator.

    Args:
      bert_config: configuration passed to `modeling.BertModel` and to the
        two output heads.
      init_checkpoint: optional checkpoint path; when truthy, variables are
        initialized from it.
      learning_rate: forwarded to `optimization.create_optimizer`.
      num_train_steps: forwarded to `optimization.create_optimizer`.
      num_warmup_steps: forwarded to `optimization.create_optimizer`.
      use_tpu: selects scaffold-based checkpoint loading and is forwarded to
        the optimizer.
      use_one_hot_embeddings: forwarded to `modeling.BertModel`.

    Returns:
      A `model_fn(features, labels, mode, params)` callable.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator.

        Builds the BERT model with the masked-LM and next-sentence heads and
        returns a `TPUEstimatorSpec` for TRAIN or EVAL mode.

        Raises:
          ValueError: if `mode` is neither TRAIN nor EVAL.
        """

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Masked-LM head: reuses the model's embedding table as output weights.
        (masked_lm_loss,
         masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
             bert_config, model.get_sequence_output(), model.get_embedding_table(),
             masked_lm_positions, masked_lm_ids, masked_lm_weights)

        # Next-sentence head: works on the pooled output.
        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = get_next_sentence_output(
             bert_config, model.get_pooled_output(), next_sentence_labels)

        # The training objective is the plain sum of both losses.
        total_loss = masked_lm_loss + next_sentence_loss

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map,
             initialized_variable_names) = modeling.get_assigment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:

                # On TPU the checkpoint initialization is deferred into a
                # scaffold function handed to the estimator spec.
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        # Log every trainable variable and whether it came from the checkpoint.
        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                          masked_lm_weights, next_sentence_example_loss,
                          next_sentence_log_probs, next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                # Flatten everything so each masked position is one row.
                masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                                 [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(
                    masked_lm_log_probs, axis=-1, output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                # The weights are passed to both masked-LM metrics.
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                next_sentence_log_probs = tf.reshape(
                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
                next_sentence_predictions = tf.argmax(
                    next_sentence_log_probs, axis=-1, output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.metrics.accuracy(
                    labels=next_sentence_labels, predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            # (function, argument tensors) pair, in `metric_fn` parameter order.
            eval_metrics = (metric_fn, [
                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                masked_lm_weights, next_sentence_example_loss,
                next_sentence_log_probs, next_sentence_labels
            ])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

        return output_spec

    return model_fn
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Compute the masked-LM loss, per-position loss and log-probabilities.

    Returns:
      A tuple `(loss, per_example_loss, log_probs)`; `loss` is the
      weight-normalized sum of the per-position losses.
    """
    gathered = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            activation_fn = modeling.get_activation(bert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                bert_config.initializer_range)
            transformed = tf.layers.dense(
                gathered,
                units=bert_config.hidden_size,
                activation=activation_fn,
                kernel_initializer=kernel_init)
            transformed = modeling.layer_norm(transformed)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.nn.bias_add(
            tf.matmul(transformed, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_label_ids = tf.reshape(label_ids, [-1])
        flat_label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            flat_label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        weighted_sum = tf.reduce_sum(flat_label_weights * per_example_loss)
        # The small constant keeps the division defined when all weights are 0.
        weight_total = tf.reduce_sum(flat_label_weights) + 1e-5
        loss = weighted_sum / weight_total

    return (loss, per_example_loss, log_probs)
def get_next_sentence_output(bert_config, input_tensor, labels):
    """Compute the next-sentence loss, per-example loss and log-probabilities.

    Returns:
      A tuple `(loss, per_example_loss, log_probs)`; `loss` is the mean of
      the per-example losses.
    """
    # Simple binary classification. Note that 0 is "next sentence" and 1 is
    # "random sentence". This weight matrix is not used after pre-training.
    with tf.variable_scope("cls/seq_relationship"):
        weight_init = modeling.create_initializer(bert_config.initializer_range)
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=weight_init)
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        logits = tf.nn.bias_add(
            tf.matmul(input_tensor, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(flat_labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
def gather_indexes(sequence_tensor, positions):
    """Pick the vectors at the given positions from every sequence in a batch.

    The rank-3 input is flattened to two dimensions, each row of `positions`
    is shifted by its sequence's offset in the flattened tensor, and the rows
    are gathered in one call.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)[:3]

    # Offset of the first token of each sequence in the flattened tensor.
    row_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    flat_offsets = tf.reshape(row_starts, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])

    flat_sequence_tensor = tf.reshape(
        sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(flat_sequence_tensor, flat_positions)
def input_fn_builder(input_files,
                     max_seq_length,
                     max_predictions_per_seq,
                     is_training,
                     num_cpu_threads=4):
    """Creates an `input_fn` closure to be passed to TPUEstimator.

    Args:
      input_files: list of TFRecord file paths.
      max_seq_length: length of the token-level features in each record.
      max_predictions_per_seq: length of the masked-LM features in each record.
      is_training: selects the shuffled, interleaved training pipeline
        instead of the sequential evaluation pipeline.
      num_cpu_threads: upper bound on the interleave cycle length; also used
        as the number of parallel batches.

    Returns:
      An `input_fn(params)` callable that reads `params["batch_size"]` and
      returns a batched dataset.
    """

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # Parsing spec; the lengths must match those used at data generation.
        name_to_features = {
            "input_ids":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids":
                tf.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
                tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
                tf.FixedLenFeature([1], tf.int64),
        }

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        if is_training:
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            d = d.repeat()
            d = d.shuffle(buffer_size=len(input_files))

            # `cycle_length` is the number of parallel files that get read.
            cycle_length = min(num_cpu_threads, len(input_files))

            # `sloppy` mode means that the interleaving is not exact. This adds
            # even more randomness to the training pipeline.
            d = d.apply(
                tf.contrib.data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy=is_training,
                    cycle_length=cycle_length))
            d = d.shuffle(buffer_size=100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            # Since we evaluate for a fixed number of steps we don't want to encounter
            # out-of-range exceptions.
            d = d.repeat()

        # We must `drop_remainder` on training because the TPU requires fixed
        # size dimensions.
        # NOTE(review): an earlier comment here said the remainder should
        # *not* be dropped for eval, but `drop_remainder=True` is passed in
        # both modes. Both branches call `d.repeat()` without a count, so a
        # partial final batch is presumably never produced; confirm before
        # changing the flag.
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                num_parallel_batches=num_cpu_threads,
                drop_remainder=True))
        return d

    return input_fn
def _decode_record(record, name_to_features):
    """Parse one serialized example and down-cast its int64 features.

    Returns the parsed feature mapping with every `tf.int64` tensor converted
    to int32.
    """
    parsed = tf.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for key in list(parsed.keys()):
        if parsed[key].dtype == tf.int64:
            parsed[key] = tf.to_int32(parsed[key])

    return parsed
def main(_):
    """Parse the command line, then train and/or evaluate BERT pre-training.

    Builds a `TPUEstimator` from the parsed arguments, optionally runs
    training, optionally runs evaluation, and writes the evaluation metrics
    to `eval_results.txt` inside the output directory.

    Raises:
      ValueError: if neither `--do_train` nor `--do_eval` is given.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--input_file", default=None, type=str, required=True,
                        help="Input TF example files (can be a glob or comma separated).")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded. "
                             "Must match data generation.")
    parser.add_argument("--max_predictions_per_seq", default=20, type=int,
                        help="Maximum number of masked LM predictions per sequence. Must match data generation.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_steps", default=100000, type=int,
                        help="Number of training steps.")
    parser.add_argument("--num_warmup_steps", default=10000, type=int,
                        help="Number of warmup steps.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--max_eval_steps", default=100, type=int,
                        help="Maximum number of eval steps.")

    ### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
    parser.add_argument("--use_tpu", default=False, action='store_true',
                        help="Whether to use TPU or GPU/CPU.")
    parser.add_argument("--tpu_name", default=None, type=str,
                        help="The Cloud TPU to use for training. This should be either the name used when creating the "
                             "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
    parser.add_argument("--tpu_zone", default=None, type=str,
                        help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
                             "to automatically detect the GCE project from metadata.")
    parser.add_argument("--gcp_project", default=None, type=str,
                        help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, "
                             "we will attempt to automatically detect the GCE project from metadata.")
    parser.add_argument("--master", default=None, type=str,
                        help="[Optional] TensorFlow master URL.")
    parser.add_argument("--num_tpu_cores", default=8, type=int,
                        help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
    ### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###

    args = parser.parse_args()

    tf.logging.set_verbosity(tf.logging.INFO)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    tf.gfile.MakeDirs(args.output_dir)

    # Each comma-separated entry of --input_file is expanded as a glob.
    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info(" %s" % input_file)

    # A cluster resolver is only created when a TPU is both requested and named.
    tpu_cluster_resolver = None
    if args.use_tpu and args.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            args.tpu_name, zone=args.tpu_zone, project=args.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=args.master,
        model_dir=args.output_dir,
        save_checkpoints_steps=args.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=args.iterations_per_loop,
            num_shards=args.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # `use_one_hot_embeddings` follows the `--use_tpu` flag.
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=args.init_checkpoint,
        learning_rate=args.learning_rate,
        num_train_steps=args.num_train_steps,
        num_warmup_steps=args.num_warmup_steps,
        use_tpu=args.use_tpu,
        use_one_hot_embeddings=args.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=args.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size)

    if args.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info(" Batch size = %d", args.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=args.max_seq_length,
            max_predictions_per_seq=args.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn, max_steps=args.num_train_steps)

    if args.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info(" Batch size = %d", args.eval_batch_size)

        # NOTE(review): evaluation reads the same `input_files` as training;
        # there is no separate flag for an evaluation set.
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=args.max_seq_length,
            max_predictions_per_seq=args.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=args.max_eval_steps)

        # Metrics are both logged and written as "key = value" lines.
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
# Script entry point: only runs when the file is executed directly.
# Control is handed to `tf.app.run()`, which presumably dispatches to the
# `main` function defined in this module (confirm against the TF docs).
if __name__ == "__main__":
    tf.app.run()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment