chenpangpang / transformers
Commit 5676d6f7 authored Nov 03, 2018 by VictorSanh
Remove BERT pretraining files for now
parent 8ec457d3
Showing 2 changed files with 0 additions and 889 deletions:
create_pretraining_data_pytorch.py  +0 −429
run_pretraining_pytorch.py  +0 −460
create_pretraining_data_pytorch.py  deleted  100644 → 0
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import random
import tokenization
import tensorflow as tf
import argparse

parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--input_file",
                    default=None,
                    type=str,
                    required=True,
                    help="Input raw text file (or comma-separated list of files).")
parser.add_argument("--output_file",
                    default=None,
                    type=str,
                    required=True,
                    help="Output TF example file (or comma-separated list of files).")
parser.add_argument("--vocab_file",
                    default=None,
                    type=str,
                    required=True,
                    help="The vocabulary file that the BERT model was trained on.")

## Other parameters
parser.add_argument("--do_lower_case",
                    default=True,
                    action='store_true',
                    help="Whether to lower case the input text. Should be True for uncased "
                         "models and False for cased models.")
parser.add_argument("--max_seq_length",
                    default=128,
                    type=int,
                    help="Maximum sequence length.")
parser.add_argument("--max_predictions_per_seq",
                    default=20,
                    type=int,
                    help="Maximum number of masked LM predictions per sequence.")
parser.add_argument("--random_seed",
                    default=12345,
                    type=int,
                    help="Random seed for data generation.")
parser.add_argument("--dupe_factor",
                    default=10,
                    type=int,
                    help="Number of times to duplicate the input data (with different masks).")
parser.add_argument("--masked_lm_prob",
                    default=0.15,
                    type=float,
                    help="Masked LM probability.")
parser.add_argument("--short_seq_prob",
                    default=0.1,
                    type=float,
                    help="Probability of creating sequences which are shorter than the maximum length.")

args = parser.parse_args()


class TrainingInstance(object):
    """A single training instance (sentence pair)."""

    def __init__(self, tokens, segment_ids, masked_lm_positions,
                 masked_lm_labels, is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels

    def __str__(self):
        s = ""
        s += "tokens: %s\n" % (" ".join(
            [tokenization.printable_text(x) for x in self.tokens]))
        s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
        s += "is_random_next: %s\n" % self.is_random_next
        s += "masked_lm_positions: %s\n" % (" ".join(
            [str(x) for x in self.masked_lm_positions]))
        s += "masked_lm_labels: %s\n" % (" ".join(
            [tokenization.printable_text(x) for x in self.masked_lm_labels]))
        s += "\n"
        return s

    def __repr__(self):
        return self.__str__()


def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
    """Create TF example files from `TrainingInstance`s."""
    writers = []
    for output_file in output_files:
        writers.append(tf.python_io.TFRecordWriter(output_file))

    writer_index = 0
    total_written = 0
    for (inst_index, instance) in enumerate(instances):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length

        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)

        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        next_sentence_label = 1 if instance.is_random_next else 0

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(input_ids)
        features["input_mask"] = create_int_feature(input_mask)
        features["segment_ids"] = create_int_feature(segment_ids)
        features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
        features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
        features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
        features["next_sentence_labels"] = create_int_feature([next_sentence_label])

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))

        writers[writer_index].write(tf_example.SerializeToString())
        writer_index = (writer_index + 1) % len(writers)

        total_written += 1

        if inst_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in instance.tokens]))

            for feature_name in features.keys():
                feature = features[feature_name]
                values = []
                if feature.int64_list.value:
                    values = feature.int64_list.value
                elif feature.float_list.value:
                    values = feature.float_list.value
                tf.logging.info(
                    "%s: %s" % (feature_name, " ".join([str(x) for x in values])))

    for writer in writers:
        writer.close()

    tf.logging.info("Wrote %d total instances", total_written)


def create_int_feature(values):
    feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return feature


def create_float_feature(values):
    feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
    return feature


def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length, short_seq_prob,
                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

    rng.shuffle(instances)
    return instances


def create_instances_from_document(
        all_documents, document_index, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
    """Creates `TrainingInstance`s for a single document."""
    document = all_documents[document_index]

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)

    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)

                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []
                # Random next
                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # This should rarely go for more than one iteration for large
                    # corpora. However, just to be careful, we try to make sure that
                    # the random document is not the same as the document
                    # we're processing.
                    for _ in range(10):
                        random_document_index = rng.randint(0, len(all_documents) - 1)
                        if random_document_index != document_index:
                            break

                    random_document = all_documents[random_document_index]
                    random_start = rng.randint(0, len(random_document) - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual next
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                tokens = []
                segment_ids = []
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append("[SEP]")
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                (tokens, masked_lm_positions,
                 masked_lm_labels) = create_masked_lm_predictions(
                     tokens, masked_lm_prob, max_predictions_per_seq,
                     vocab_words, rng)
                instance = TrainingInstance(
                    tokens=tokens,
                    segment_ids=segment_ids,
                    is_random_next=is_random_next,
                    masked_lm_positions=masked_lm_positions,
                    masked_lm_labels=masked_lm_labels)
                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

    return instances


def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
    """Creates the predictions for the masked LM objective."""
    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indexes.append(i)

    rng.shuffle(cand_indexes)

    output_tokens = list(tokens)

    masked_lm = collections.namedtuple("masked_lm",
                                       ["index", "label"])  # pylint: disable=invalid-name

    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(tokens) * masked_lm_prob))))

    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        masked_token = None
        # 80% of the time, replace with [MASK]
        if rng.random() < 0.8:
            masked_token = "[MASK]"
        else:
            # 10% of the time, keep original
            if rng.random() < 0.5:
                masked_token = tokens[index]
            # 10% of the time, replace with random word
            else:
                masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

        output_tokens[index] = masked_token

        masked_lms.append(masked_lm(index=index, label=tokens[index]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)

    masked_lm_positions = []
    masked_lm_labels = []
    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)

    return (output_tokens, masked_lm_positions, masked_lm_labels)


def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
    """Truncates a pair of sequences to a maximum sequence length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1

        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if rng.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()


def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    input_files = []
    for input_pattern in args.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
        rng)

    output_files = args.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
                                    args.max_predictions_per_seq, output_files)


if __name__ == "__main__":
    tf.app.run()
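The comments in create_training_instances above spell out the expected input format: one sentence per line, with a blank line marking a document boundary. A minimal sketch of producing such a file (not part of the commit; the file name is illustrative):

# Build a tiny two-document corpus in the format create_training_instances
# expects: one sentence per line, blank line between documents.
sample = (
    "The first document has two sentences.\n"
    "This is its second sentence.\n"
    "\n"
    "The second document starts after the blank line.\n"
    "It likewise has a second sentence.\n"
)
with open("sample_text.txt", "w") as f:  # illustrative path
    f.write(sample)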
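The records written by write_instance_to_example_files can be read back with the same TF 1.x APIs the script itself relies on, which is useful for checking the feature schema (input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels). A minimal sketch, not part of the commit, assuming TF 1.x and a hypothetical output path:

# Read back the first serialized tf.train.Example and print its features,
# matching the schema assembled in write_instance_to_example_files above.
import tensorflow as tf

path = "/tmp/tf_examples.tfrecord"  # hypothetical --output_file value
for record in tf.python_io.tf_record_iterator(path):
    example = tf.train.Example.FromString(record)
    for name in ("input_ids", "input_mask", "segment_ids",
                 "masked_lm_positions", "masked_lm_ids",
                 "next_sentence_labels"):
        print(name, list(example.features.feature[name].int64_list.value))
    print("masked_lm_weights",
          list(example.features.feature["masked_lm_weights"].float_list.value))
    break  # inspect only the first example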
run_pretraining_pytorch.py  deleted  100644 → 0
This diff is collapsed (460 deletions, not shown).