chenpangpang / transformers · Commits

Commit 5676d6f7, authored Nov 03, 2018 by VictorSanh

    Remove BERT pretraining files for now

Parent: 8ec457d3
Showing 2 changed files with 0 additions and 889 deletions:

    create_pretraining_data_pytorch.py   +0  -429
    run_pretraining_pytorch.py           +0  -460
create_pretraining_data_pytorch.py  (deleted, 100644 → 0)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import random
import tokenization
import tensorflow as tf
import argparse
parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--input_file",
                    default=None,
                    type=str,
                    required=True,
                    help="Input raw text file (or comma-separated list of files).")
parser.add_argument("--output_file",
                    default=None,
                    type=str,
                    required=True,
                    help="Output TF example file (or comma-separated list of files).")
parser.add_argument("--vocab_file",
                    default=None,
                    type=str,
                    required=True,
                    help="The vocabulary file that the BERT model was trained on.")

## Other parameters
parser.add_argument("--do_lower_case",
                    default=True,
                    action='store_true',
                    help="Whether to lower case the input text. Should be True for uncased "
                         "models and False for cased models.")
parser.add_argument("--max_seq_length",
                    default=128,
                    type=int,
                    help="Maximum sequence length.")
parser.add_argument("--max_predictions_per_seq",
                    default=20,
                    type=int,
                    help="Maximum number of masked LM predictions per sequence.")
parser.add_argument("--random_seed",
                    default=12345,
                    type=int,
                    help="Random seed for data generation.")
parser.add_argument("--dupe_factor",
                    default=10,
                    type=int,
                    help="Number of times to duplicate the input data (with different masks).")
parser.add_argument("--masked_lm_prob",
                    default=0.15,
                    type=float,
                    help="Masked LM probability.")
parser.add_argument("--short_seq_prob",
                    default=0.1,
                    type=float,
                    help="Probability of creating sequences which are shorter than the maximum length.")

args = parser.parse_args()
class TrainingInstance(object):
    """A single training instance (sentence pair)."""

    def __init__(self, tokens, segment_ids, masked_lm_positions,
                 masked_lm_labels, is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels

    def __str__(self):
        s = ""
        s += "tokens: %s\n" % (" ".join(
            [tokenization.printable_text(x) for x in self.tokens]))
        s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
        s += "is_random_next: %s\n" % self.is_random_next
        s += "masked_lm_positions: %s\n" % (" ".join(
            [str(x) for x in self.masked_lm_positions]))
        s += "masked_lm_labels: %s\n" % (" ".join(
            [tokenization.printable_text(x) for x in self.masked_lm_labels]))
        s += "\n"
        return s

    def __repr__(self):
        return self.__str__()
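# --- Editor's sketch (not part of the original commit): what one instance
# looks like, with purely hypothetical tokens and positions. ---
_example_instance = TrainingInstance(
    tokens=["[CLS]", "the", "[MASK]", "sat", "[SEP]", "it", "slept", "[SEP]"],
    segment_ids=[0, 0, 0, 0, 0, 1, 1, 1],
    masked_lm_positions=[2],        # index of the masked token
    masked_lm_labels=["cat"],       # the original token at that position
    is_random_next=False)           # sentence B really followed sentence A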
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.python_io.TFRecordWriter(output_file))

    writer_index = 0
    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)

        assert len(input_ids) <= max_seq_length
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        next_sentence_label = 1 if instance.is_random_next else 0

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)
        features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
        features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
        features["next_sentence_labels"] = create_int_feature([next_sentence_label])

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))

        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)

        total_written += 1

        if inst_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))

            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.logging.info(
                    "%s: %s" % (feature_name, " ".join([str(x) for x in values])))

    for writer in writers:
        writer.close()

    tf.logging.info("Wrote %d total instances", total_written)
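# --- Editor's sketch (not part of the original commit): how a written record
# could be read back for inspection, assuming a hypothetical output path
# "train.tfrecord" and the TF 1.x record iterator. ---
def _inspect_first_record(path="train.tfrecord"):
    for record in tf.python_io.tf_record_iterator(path):
        example = tf.train.Example()
        example.ParseFromString(record)
        # Every feature is zero-padded to a fixed length, as built above.
        print(example.features.feature["input_ids"].int64_list.value)
        break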
def create_int_feature(values):
    feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return feature


def create_float_feature(values):
    feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return feature
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length, short_seq_prob,
                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

    rng.shuffle(instances)
    return instances
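# --- Editor's note (not part of the original commit): a tiny hypothetical
# input file in the format the comments above describe -- one sentence per
# line, with a blank line separating documents:
#
#     The cat sat on the mat.
#     It was looking out of the window.
#
#     The second document starts after the blank line.
#     Each of its sentences is also on its own line.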
def create_instances_from_document(
        all_documents, document_index, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
    """Creates `TrainingInstance`s for a single document."""
    document = all_documents[document_index]

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)

    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)

                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []
                # Random next
                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # This should rarely go for more than one iteration for large
                    # corpora. However, just to be careful, we try to make sure that
                    # the random document is not the same as the document
                    # we're processing.
                    for _ in range(10):
                        random_document_index = rng.randint(0, len(all_documents) - 1)
                        if random_document_index != document_index:
                            break

                    random_document = all_documents[random_document_index]
                    random_start = rng.randint(0, len(random_document) - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual next
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                tokens = []
                segment_ids = []
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append("[SEP]")
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                (tokens, masked_lm_positions,
                 masked_lm_labels) = create_masked_lm_predictions(
                     tokens, masked_lm_prob, max_predictions_per_seq,
                     vocab_words, rng)
                instance = TrainingInstance(
                    tokens=tokens,
                    segment_ids=segment_ids,
                    is_random_next=is_random_next,
                    masked_lm_positions=masked_lm_positions,
                    masked_lm_labels=masked_lm_labels)
                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

    return instances
def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
    """Creates the predictions for the masked LM objective."""
    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indexes.append(i)

    rng.shuffle(cand_indexes)

    output_tokens = list(tokens)

    masked_lm = collections.namedtuple("masked_lm",
                                       ["index", "label"])  # pylint: disable=invalid-name

    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(tokens) * masked_lm_prob))))

    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        masked_token = None
        # 80% of the time, replace with [MASK]
        if rng.random() < 0.8:
            masked_token = "[MASK]"
        else:
            # 10% of the time, keep original
            if rng.random() < 0.5:
                masked_token = tokens[index]
            # 10% of the time, replace with random word
            else:
                masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

        output_tokens[index] = masked_token

        masked_lms.append(masked_lm(index=index, label=tokens[index]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)

    masked_lm_positions = []
    masked_lm_labels = []
    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)

    return (output_tokens, masked_lm_positions, masked_lm_labels)
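# --- Editor's sketch (not part of the original commit): an empirical check of
# the 80/10/10 masking split implemented above. ---
def _masking_split_demo(trials=100000, seed=0):
    rng = random.Random(seed)
    counts = collections.Counter()
    for _ in range(trials):
        if rng.random() < 0.8:
            counts["[MASK]"] += 1          # ~80%: replace with [MASK]
        elif rng.random() < 0.5:
            counts["keep original"] += 1   # ~10%: keep the original token
        else:
            counts["random word"] += 1     # ~10%: replace with a random token
    return counts                          # roughly 80000 / 10000 / 10000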
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
    """Truncates a pair of sequences to a maximum sequence length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if rng.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()
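# --- Editor's sketch (not part of the original commit): truncate_seq_pair
# trims the longer list in place until the pair fits the budget. ---
def _truncate_demo():
    a, b = ["tok"] * 6, ["tok"] * 3
    truncate_seq_pair(a, b, max_num_tokens=7, rng=random.Random(0))
    assert len(a) + len(b) == 7   # two tokens were removed from `a`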
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
        rng)

    output_files = args.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
                                    args.max_predictions_per_seq, output_files)


if __name__ == "__main__":
    tf.app.run()
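For reference, a hypothetical invocation of the deleted script (all paths are illustrative, not from the commit):

    python create_pretraining_data_pytorch.py \
        --input_file=corpus.txt \
        --output_file=train.tfrecord \
        --vocab_file=vocab.txt \
        --max_seq_length=128 \
        --max_predictions_per_seq=20 \
        --masked_lm_prob=0.15 \
        --dupe_factor=10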
run_pretraining_pytorch.py  (deleted, 100644 → 0)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run masked LM/next sentence masked_lm pre-training for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import modeling
import optimization
import tensorflow as tf
import argparse
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """Returns `model_fn` closure for TPUEstimator."""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_log_probs) = get_masked_lm_output(
             bert_config, model.get_sequence_output(), model.get_embedding_table(),
             masked_lm_positions, masked_lm_ids, masked_lm_weights)

        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = get_next_sentence_output(
             bert_config, model.get_pooled_output(), next_sentence_labels)

        total_loss = masked_lm_loss + next_sentence_loss

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map,
             initialized_variable_names) = modeling.get_assigment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                use_tpu)

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights,
                          next_sentence_example_loss, next_sentence_log_probs,
                          next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                masked_lm_log_probs = tf.reshape(
                    masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(
                    masked_lm_log_probs, axis=-1, output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                next_sentence_log_probs = tf.reshape(
                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
                next_sentence_predictions = tf.argmax(
                    next_sentence_log_probs, axis=-1, output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.metrics.accuracy(
                    labels=next_sentence_labels,
                    predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metrics = (metric_fn, [
                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                masked_lm_weights, next_sentence_example_loss,
                next_sentence_log_probs, next_sentence_labels
            ])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

        return output_spec

    return model_fn
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs)
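# --- Editor's sketch (not part of the original commit): the weighted mean
# computed above, on hypothetical per-prediction losses; padded predictions
# (weight 0.0) drop out of both numerator and denominator. ---
def _weighted_loss_demo():
    per_example_loss = [2.0, 4.0, 9.9]   # last entry is a padded prediction
    label_weights = [1.0, 1.0, 0.0]
    numerator = sum(w * l for w, l in zip(label_weights, per_example_loss))
    denominator = sum(label_weights) + 1e-5
    return numerator / denominator       # ~3.0; the 9.9 is ignored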
def get_next_sentence_output(bert_config, input_tensor, labels):
    """Get loss and log probs for the next sentence prediction."""

    # Simple binary classification. Note that 0 is "next sentence" and 1 is
    # "random sentence". This weight matrix is not used after pre-training.
    with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=modeling.create_initializer(bert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)
def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at the specific positions over a minibatch."""
    sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
    return output_tensor
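# --- Editor's sketch (not part of the original commit): the flat-index
# arithmetic above, in plain Python with hypothetical shapes
# (batch_size=2, seq_length=4). ---
def _gather_indexes_demo():
    seq_length = 4
    positions = [[1, 3], [0, 2]]                     # masked positions per example
    flat_offsets = [0 * seq_length, 1 * seq_length]  # [0, 4]
    flat_positions = [p + off
                      for row, off in zip(positions, flat_offsets)
                      for p in row]
    return flat_positions                            # [1, 3, 4, 6]: rows to gather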
def input_fn_builder(input_files, max_seq_length, max_predictions_per_seq,
                     is_training, num_cpu_threads=4):
    """Creates an `input_fn` closure to be passed to TPUEstimator."""

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        name_to_features = {
            "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
                tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
        }

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        if is_training:
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            d = d.repeat()
            d = d.shuffle(buffer_size=len(input_files))

            # `cycle_length` is the number of parallel files that get read.
            cycle_length = min(num_cpu_threads, len(input_files))

            # `sloppy` mode means that the interleaving is not exact. This adds
            # even more randomness to the training pipeline.
            d = d.apply(
                tf.contrib.data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy=is_training,
                    cycle_length=cycle_length))
            d = d.shuffle(buffer_size=100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            # Since we evaluate for a fixed number of steps we don't want to
            # encounter out-of-range exceptions.
            d = d.repeat()

        # We must `drop_remainder` on training because the TPU requires fixed
        # size dimensions. For eval, we assume we are evaling on the CPU or GPU
        # and we *don't* want to drop the remainder, otherwise we won't cover
        # every sample.
        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                num_parallel_batches=num_cpu_threads,
                drop_remainder=True))
        return d

    return input_fn
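# --- Editor's sketch (not part of the original commit): TPUEstimator calls the
# returned closure itself and supplies `params`; a direct call with a
# hypothetical batch size shows what the closure produces. ---
def _input_fn_demo():
    train_input_fn = input_fn_builder(
        input_files=["train.tfrecord"],   # assumed path
        max_seq_length=128,
        max_predictions_per_seq=20,
        is_training=True)
    return train_input_fn({"batch_size": 32})   # a batched tf.data.Dataset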
def _decode_record(record, name_to_features):
    """Decodes a record to a TensorFlow example."""
    example = tf.parse_single_example(record, name_to_features)

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.to_int32(t)
        example[name] = t

    return example
def main(_):
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--input_file",
                        default=None,
                        type=str,
                        required=True,
                        help="Input TF example files (can be a glob or comma separated).")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded. "
                             "Must match data generation.")
    parser.add_argument("--max_predictions_per_seq",
                        default=20,
                        type=int,
                        help="Maximum number of masked LM predictions per sequence. Must match data generation.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_steps",
                        default=100000,
                        type=int,
                        help="Number of training steps.")
    parser.add_argument("--num_warmup_steps",
                        default=10000,
                        type=int,
                        help="Number of warmup steps.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--max_eval_steps",
                        default=100,
                        type=int,
                        help="Maximum number of eval steps.")

    ### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
    parser.add_argument("--use_tpu",
                        default=False,
                        action='store_true',
                        help="Whether to use TPU or GPU/CPU.")
    parser.add_argument("--tpu_name",
                        default=None,
                        type=str,
                        help="The Cloud TPU to use for training. This should be either the name used when creating the "
                             "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
    parser.add_argument("--tpu_zone",
                        default=None,
                        type=str,
                        help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
                             "to automatically detect the GCE project from metadata.")
    parser.add_argument("--gcp_project",
                        default=None,
                        type=str,
                        help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, "
                             "we will attempt to automatically detect the GCE project from metadata.")
    parser.add_argument("--master",
                        default=None,
                        type=str,
                        help="[Optional] TensorFlow master URL.")
    parser.add_argument("--num_tpu_cores",
                        default=8,
                        type=int,
                        help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
    ### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###

    args = parser.parse_args()

    tf.logging.set_verbosity(tf.logging.INFO)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)

    tf.gfile.MakeDirs(args.output_dir)

    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if args.use_tpu and args.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            args.tpu_name, zone=args.tpu_zone, project=args.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=args.master,
        model_dir=args.output_dir,
        save_checkpoints_steps=args.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=args.iterations_per_loop,
            num_shards=args.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=args.init_checkpoint,
        learning_rate=args.learning_rate,
        num_train_steps=args.num_train_steps,
        num_warmup_steps=args.num_warmup_steps,
        use_tpu=args.use_tpu,
        use_one_hot_embeddings=args.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=args.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size)

    if args.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", args.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=args.max_seq_length,
            max_predictions_per_seq=args.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn, max_steps=args.num_train_steps)

    if args.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", args.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=args.max_seq_length,
            max_predictions_per_seq=args.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=args.max_eval_steps)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    tf.app.run()
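For reference, a hypothetical invocation of the deleted script (paths and figures are illustrative, not from the commit):

    python run_pretraining_pytorch.py \
        --bert_config_file=bert_config.json \
        --input_file=train.tfrecord \
        --output_dir=pretraining_output \
        --do_train \
        --do_eval \
        --train_batch_size=32 \
        --num_train_steps=100000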