Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
2f49360d
Commit
2f49360d
authored
Nov 01, 2018
by
Tim Rault
Browse files
Convert flags to argparse in 'extract_features_pytorch.py'
parent
04a360bb
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
390 additions
and
0 deletions
+390
-0
extract_features_pytorch.py
extract_features_pytorch.py
+390
-0
No files found.
extract_features_pytorch.py
0 → 100644
View file @
2f49360d
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from BERT."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
codecs
import
collections
import
json
import
re
import
modeling
import
tokenization
import
tensorflow
as
tf
import
argparse
parser
=
argparse
.
ArgumentParser
()
## Required parameters
parser
.
add_argument
(
"--input_file"
,
default
=
None
,
type
=
str
,
required
=
True
)
parser
.
add_argument
(
"--vocab_file"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The vocabulary file that the BERT model was trained on."
)
parser
.
add_argument
(
"--output_file"
,
default
=
None
,
type
=
str
,
required
=
True
)
parser
.
add_argument
(
"--bert_config_file"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The config json file corresponding to the pre-trained BERT model. "
"This specifies the model architecture."
)
parser
.
add_argument
(
"--init_checkpoint"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"Initial checkpoint (usually from a pre-trained BERT model)."
)
## Other parameters
parser
.
add_argument
(
"--layers"
,
default
=
"-1,-2,-3,-4"
,
type
=
str
)
parser
.
add_argument
(
"--max_seq_length"
,
default
=
128
,
type
=
int
,
help
=
"The maximum total input sequence length after WordPiece tokenization. Sequences longer "
"than this will be truncated, and sequences shorter than this will be padded."
)
parser
.
add_argument
(
"--do_lower_case"
,
default
=
True
,
type
=
bool
,
help
=
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models."
)
parser
.
add_argument
(
"--batch_size"
,
default
=
32
,
type
=
int
,
help
=
"Batch size for predictions."
)
### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
parser
.
add_argument
(
"--use_tpu"
,
default
=
False
,
type
=
bool
,
help
=
"Whether to use TPU or GPU/CPU."
)
parser
.
add_argument
(
"--master"
,
default
=
None
,
type
=
str
,
help
=
"If using a TPU, the address of the master."
)
parser
.
add_argument
(
"--num_tpu_cores"
,
default
=
8
,
type
=
int
,
help
=
"Only used if `use_tpu` is True. Total number of TPU cores to use."
)
### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
parser
.
add_argument
(
"--use_one_hot_embeddings"
,
default
=
False
,
type
=
bool
,
help
=
"If True, tf.one_hot will be used for embedding lookups, otherwise tf.nn.embedding_lookup "
"will be used. On TPUs, this should be True since it is much faster."
)
args
=
parser
.
parse_args
()
class
InputExample
(
object
):
def
__init__
(
self
,
unique_id
,
text_a
,
text_b
):
self
.
unique_id
=
unique_id
self
.
text_a
=
text_a
self
.
text_b
=
text_b
class
InputFeatures
(
object
):
"""A single set of features of data."""
def
__init__
(
self
,
unique_id
,
tokens
,
input_ids
,
input_mask
,
input_type_ids
):
self
.
unique_id
=
unique_id
self
.
tokens
=
tokens
self
.
input_ids
=
input_ids
self
.
input_mask
=
input_mask
self
.
input_type_ids
=
input_type_ids
def
input_fn_builder
(
features
,
seq_length
):
"""Creates an `input_fn` closure to be passed to TPUEstimator."""
all_unique_ids
=
[]
all_input_ids
=
[]
all_input_mask
=
[]
all_input_type_ids
=
[]
for
feature
in
features
:
all_unique_ids
.
append
(
feature
.
unique_id
)
all_input_ids
.
append
(
feature
.
input_ids
)
all_input_mask
.
append
(
feature
.
input_mask
)
all_input_type_ids
.
append
(
feature
.
input_type_ids
)
def
input_fn
(
params
):
"""The actual input function."""
batch_size
=
params
[
"batch_size"
]
num_examples
=
len
(
features
)
# This is for demo purposes and does NOT scale to large data sets. We do
# not use Dataset.from_generator() because that uses tf.py_func which is
# not TPU compatible. The right way to load data is with TFRecordReader.
d
=
tf
.
data
.
Dataset
.
from_tensor_slices
({
"unique_ids"
:
tf
.
constant
(
all_unique_ids
,
shape
=
[
num_examples
],
dtype
=
tf
.
int32
),
"input_ids"
:
tf
.
constant
(
all_input_ids
,
shape
=
[
num_examples
,
seq_length
],
dtype
=
tf
.
int32
),
"input_mask"
:
tf
.
constant
(
all_input_mask
,
shape
=
[
num_examples
,
seq_length
],
dtype
=
tf
.
int32
),
"input_type_ids"
:
tf
.
constant
(
all_input_type_ids
,
shape
=
[
num_examples
,
seq_length
],
dtype
=
tf
.
int32
),
})
d
=
d
.
batch
(
batch_size
=
batch_size
,
drop_remainder
=
False
)
return
d
return
input_fn
def
model_fn_builder
(
bert_config
,
init_checkpoint
,
layer_indexes
,
use_tpu
,
use_one_hot_embeddings
):
"""Returns `model_fn` closure for TPUEstimator."""
def
model_fn
(
features
,
labels
,
mode
,
params
):
# pylint: disable=unused-argument
"""The `model_fn` for TPUEstimator."""
unique_ids
=
features
[
"unique_ids"
]
input_ids
=
features
[
"input_ids"
]
input_mask
=
features
[
"input_mask"
]
input_type_ids
=
features
[
"input_type_ids"
]
model
=
modeling
.
BertModel
(
config
=
bert_config
,
is_training
=
False
,
input_ids
=
input_ids
,
input_mask
=
input_mask
,
token_type_ids
=
input_type_ids
,
use_one_hot_embeddings
=
use_one_hot_embeddings
)
if
mode
!=
tf
.
estimator
.
ModeKeys
.
PREDICT
:
raise
ValueError
(
"Only PREDICT modes are supported: %s"
%
(
mode
))
tvars
=
tf
.
trainable_variables
()
scaffold_fn
=
None
(
assignment_map
,
_
)
=
modeling
.
get_assigment_map_from_checkpoint
(
tvars
,
init_checkpoint
)
if
use_tpu
:
def
tpu_scaffold
():
tf
.
train
.
init_from_checkpoint
(
init_checkpoint
,
assignment_map
)
return
tf
.
train
.
Scaffold
()
scaffold_fn
=
tpu_scaffold
else
:
tf
.
train
.
init_from_checkpoint
(
init_checkpoint
,
assignment_map
)
all_layers
=
model
.
get_all_encoder_layers
()
predictions
=
{
"unique_id"
:
unique_ids
,
}
for
(
i
,
layer_index
)
in
enumerate
(
layer_indexes
):
predictions
[
"layer_output_%d"
%
i
]
=
all_layers
[
layer_index
]
output_spec
=
tf
.
contrib
.
tpu
.
TPUEstimatorSpec
(
mode
=
mode
,
predictions
=
predictions
,
scaffold_fn
=
scaffold_fn
)
return
output_spec
return
model_fn
def
convert_examples_to_features
(
examples
,
seq_length
,
tokenizer
):
"""Loads a data file into a list of `InputBatch`s."""
features
=
[]
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
tokens_a
=
tokenizer
.
tokenize
(
example
.
text_a
)
tokens_b
=
None
if
example
.
text_b
:
tokens_b
=
tokenizer
.
tokenize
(
example
.
text_b
)
if
tokens_b
:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair
(
tokens_a
,
tokens_b
,
seq_length
-
3
)
else
:
# Account for [CLS] and [SEP] with "- 2"
if
len
(
tokens_a
)
>
seq_length
-
2
:
tokens_a
=
tokens_a
[
0
:(
seq_length
-
2
)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambigiously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens
=
[]
input_type_ids
=
[]
tokens
.
append
(
"[CLS]"
)
input_type_ids
.
append
(
0
)
for
token
in
tokens_a
:
tokens
.
append
(
token
)
input_type_ids
.
append
(
0
)
tokens
.
append
(
"[SEP]"
)
input_type_ids
.
append
(
0
)
if
tokens_b
:
for
token
in
tokens_b
:
tokens
.
append
(
token
)
input_type_ids
.
append
(
1
)
tokens
.
append
(
"[SEP]"
)
input_type_ids
.
append
(
1
)
input_ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask
=
[
1
]
*
len
(
input_ids
)
# Zero-pad up to the sequence length.
while
len
(
input_ids
)
<
seq_length
:
input_ids
.
append
(
0
)
input_mask
.
append
(
0
)
input_type_ids
.
append
(
0
)
assert
len
(
input_ids
)
==
seq_length
assert
len
(
input_mask
)
==
seq_length
assert
len
(
input_type_ids
)
==
seq_length
if
ex_index
<
5
:
tf
.
logging
.
info
(
"*** Example ***"
)
tf
.
logging
.
info
(
"unique_id: %s"
%
(
example
.
unique_id
))
tf
.
logging
.
info
(
"tokens: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
tokens
]))
tf
.
logging
.
info
(
"input_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_ids
]))
tf
.
logging
.
info
(
"input_mask: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_mask
]))
tf
.
logging
.
info
(
"input_type_ids: %s"
%
" "
.
join
([
str
(
x
)
for
x
in
input_type_ids
]))
features
.
append
(
InputFeatures
(
unique_id
=
example
.
unique_id
,
tokens
=
tokens
,
input_ids
=
input_ids
,
input_mask
=
input_mask
,
input_type_ids
=
input_type_ids
))
return
features
def
_truncate_seq_pair
(
tokens_a
,
tokens_b
,
max_length
):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while
True
:
total_length
=
len
(
tokens_a
)
+
len
(
tokens_b
)
if
total_length
<=
max_length
:
break
if
len
(
tokens_a
)
>
len
(
tokens_b
):
tokens_a
.
pop
()
else
:
tokens_b
.
pop
()
def
read_examples
(
input_file
):
"""Read a list of `InputExample`s from an input file."""
examples
=
[]
unique_id
=
0
with
tf
.
gfile
.
GFile
(
input_file
,
"r"
)
as
reader
:
while
True
:
line
=
tokenization
.
convert_to_unicode
(
reader
.
readline
())
if
not
line
:
break
line
=
line
.
strip
()
text_a
=
None
text_b
=
None
m
=
re
.
match
(
r
"^(.*) \|\|\| (.*)$"
,
line
)
if
m
is
None
:
text_a
=
line
else
:
text_a
=
m
.
group
(
1
)
text_b
=
m
.
group
(
2
)
examples
.
append
(
InputExample
(
unique_id
=
unique_id
,
text_a
=
text_a
,
text_b
=
text_b
))
unique_id
+=
1
return
examples
def
main
(
_
):
tf
.
logging
.
set_verbosity
(
tf
.
logging
.
INFO
)
layer_indexes
=
[
int
(
x
)
for
x
in
args
.
layers
.
split
(
","
)]
bert_config
=
modeling
.
BertConfig
.
from_json_file
(
args
.
bert_config_file
)
tokenizer
=
tokenization
.
FullTokenizer
(
vocab_file
=
args
.
vocab_file
,
do_lower_case
=
args
.
do_lower_case
)
is_per_host
=
tf
.
contrib
.
tpu
.
InputPipelineConfig
.
PER_HOST_V2
run_config
=
tf
.
contrib
.
tpu
.
RunConfig
(
master
=
args
.
master
,
tpu_config
=
tf
.
contrib
.
tpu
.
TPUConfig
(
num_shards
=
args
.
num_tpu_cores
,
per_host_input_for_training
=
is_per_host
))
examples
=
read_examples
(
args
.
input_file
)
features
=
convert_examples_to_features
(
examples
=
examples
,
seq_length
=
args
.
max_seq_length
,
tokenizer
=
tokenizer
)
unique_id_to_feature
=
{}
for
feature
in
features
:
unique_id_to_feature
[
feature
.
unique_id
]
=
feature
model_fn
=
model_fn_builder
(
bert_config
=
bert_config
,
init_checkpoint
=
args
.
init_checkpoint
,
layer_indexes
=
layer_indexes
,
use_tpu
=
args
.
use_tpu
,
use_one_hot_embeddings
=
args
.
use_one_hot_embeddings
)
# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
estimator
=
tf
.
contrib
.
tpu
.
TPUEstimator
(
use_tpu
=
args
.
use_tpu
,
model_fn
=
model_fn
,
config
=
run_config
,
predict_batch_size
=
args
.
batch_size
)
input_fn
=
input_fn_builder
(
features
=
features
,
seq_length
=
args
.
max_seq_length
)
with
codecs
.
getwriter
(
"utf-8"
)(
tf
.
gfile
.
Open
(
args
.
output_file
,
"w"
))
as
writer
:
for
result
in
estimator
.
predict
(
input_fn
,
yield_single_examples
=
True
):
unique_id
=
int
(
result
[
"unique_id"
])
feature
=
unique_id_to_feature
[
unique_id
]
output_json
=
collections
.
OrderedDict
()
output_json
[
"linex_index"
]
=
unique_id
all_features
=
[]
for
(
i
,
token
)
in
enumerate
(
feature
.
tokens
):
all_layers
=
[]
for
(
j
,
layer_index
)
in
enumerate
(
layer_indexes
):
layer_output
=
result
[
"layer_output_%d"
%
j
]
layers
=
collections
.
OrderedDict
()
layers
[
"index"
]
=
layer_index
layers
[
"values"
]
=
[
round
(
float
(
x
),
6
)
for
x
in
layer_output
[
i
:(
i
+
1
)].
flat
]
all_layers
.
append
(
layers
)
features
=
collections
.
OrderedDict
()
features
[
"token"
]
=
token
features
[
"layers"
]
=
all_layers
all_features
.
append
(
features
)
output_json
[
"features"
]
=
all_features
writer
.
write
(
json
.
dumps
(
output_json
)
+
"
\n
"
)
if
__name__
==
"__main__"
:
tf
.
app
.
run
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment