chenpangpang / transformers

Commit ebfffa0a, authored Nov 02, 2018 by thomwolf

updated extract_features

parent 9af479b3
Showing 2 changed files with 98 additions and 191 deletions:

    extract_features_pytorch.py  (+81, -172)
    run_squad_pytorch.py         (+17, -19)
extract_features_pytorch.py
...
...
@@ -18,16 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import argparse
 import codecs
 import collections
+import logging
 import json
 import re

-import modeling
 import tokenization
-import tensorflow as tf
-import argparse
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+from modeling_pytorch import BertConfig, BertModel
+
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)

 parser = argparse.ArgumentParser()
...
...
@@ -47,19 +55,14 @@ parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
 parser.add_argument("--max_seq_length", default=128, type=int,
                     help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                          "than this will be truncated, and sequences shorter than this will be padded.")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")

 ### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--master", default=None, type=str, help="If using a TPU, the address of the master.")
-parser.add_argument("--num_tpu_cores", default=8, type=int,
-                    help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
 ### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_one_hot_embeddings", default=False, type=bool,
-                    help="If True, tf.one_hot will be used for embedding lookups, otherwise tf.nn.embedding_lookup "
-                         "will be used. On TPUs, this should be True since it is much faster.")
+parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")

 args = parser.parse_args()
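A side note on the `--do_lower_case` pattern above (an illustrative aside, not part of the commit): `type=bool` is a classic argparse pitfall, because argparse applies bool() to the raw string and any non-empty string, including "False", is truthy. `action='store_true'` avoids that parsing, but combined with `default=True` the flag can never be turned off. A minimal sketch:

import argparse

parser = argparse.ArgumentParser()
# Pitfall: bool("False") is True, so this flag cannot be disabled from the CLI.
parser.add_argument("--buggy_flag", type=bool, default=False)
# store_true sets the value to True when the flag is present; with
# default=True the value is therefore True whether or not the flag is passed.
parser.add_argument("--always_true", default=True, action='store_true')
# A flag that can actually be toggled needs default=False.
parser.add_argument("--toggle", default=False, action='store_true')

args = parser.parse_args(["--buggy_flag", "False"])
print(args.buggy_flag)   # True, not False
print(args.always_true)  # True
print(args.toggle)       # False unless --toggle is passed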
...
...
@@ -83,107 +86,6 @@ class InputFeatures(object):
         self.input_type_ids = input_type_ids


-def input_fn_builder(features, seq_length):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-    all_unique_ids = []
-    all_input_ids = []
-    all_input_mask = []
-    all_input_type_ids = []
-
-    for feature in features:
-        all_unique_ids.append(feature.unique_id)
-        all_input_ids.append(feature.input_ids)
-        all_input_mask.append(feature.input_mask)
-        all_input_type_ids.append(feature.input_type_ids)
-
-    def input_fn(params):
-        """The actual input function."""
-        batch_size = params["batch_size"]
-        num_examples = len(features)
-
-        # This is for demo purposes and does NOT scale to large data sets. We do
-        # not use Dataset.from_generator() because that uses tf.py_func which is
-        # not TPU compatible. The right way to load data is with TFRecordReader.
-        d = tf.data.Dataset.from_tensor_slices({
-            "unique_ids": tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
-            "input_ids": tf.constant(all_input_ids, shape=[num_examples, seq_length], dtype=tf.int32),
-            "input_mask": tf.constant(all_input_mask, shape=[num_examples, seq_length], dtype=tf.int32),
-            "input_type_ids": tf.constant(all_input_type_ids, shape=[num_examples, seq_length], dtype=tf.int32),
-        })
-
-        d = d.batch(batch_size=batch_size, drop_remainder=False)
-        return d
-
-    return input_fn
-
-
-def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, use_one_hot_embeddings):
-    """Returns `model_fn` closure for TPUEstimator."""
-
-    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
-        """The `model_fn` for TPUEstimator."""
-        unique_ids = features["unique_ids"]
-        input_ids = features["input_ids"]
-        input_mask = features["input_mask"]
-        input_type_ids = features["input_type_ids"]
-
-        model = modeling.BertModel(
-            config=bert_config,
-            is_training=False,
-            input_ids=input_ids,
-            input_mask=input_mask,
-            token_type_ids=input_type_ids,
-            use_one_hot_embeddings=use_one_hot_embeddings)
-
-        if mode != tf.estimator.ModeKeys.PREDICT:
-            raise ValueError("Only PREDICT modes are supported: %s" % (mode))
-
-        tvars = tf.trainable_variables()
-        scaffold_fn = None
-        (assignment_map, _) = modeling.get_assigment_map_from_checkpoint(tvars, init_checkpoint)
-        if use_tpu:
-            def tpu_scaffold():
-                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-                return tf.train.Scaffold()
-
-            scaffold_fn = tpu_scaffold
-        else:
-            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-
-        all_layers = model.get_all_encoder_layers()
-
-        predictions = {
-            "unique_id": unique_ids,
-        }
-        for (i, layer_index) in enumerate(layer_indexes):
-            predictions["layer_output_%d" % i] = all_layers[layer_index]
-
-        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
-        return output_spec
-
-    return model_fn
-
-
 def convert_examples_to_features(examples, seq_length, tokenizer):
     """Loads a data file into a list of `InputBatch`s."""
...
...
@@ -257,12 +159,12 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
         assert len(input_type_ids) == seq_length

         if ex_index < 5:
-            tf.logging.info("*** Example ***")
-            tf.logging.info("unique_id: %s" % (example.unique_id))
-            tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            tf.logging.info("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+            logger.info("*** Example ***")
+            logger.info("unique_id: %s" % (example.unique_id))
+            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

         features.append(
...
...
@@ -296,7 +198,7 @@ def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples
=
[]
unique_id
=
0
with
tf
.
gfile
.
GFile
(
input_file
,
"r"
)
as
reader
:
with
open
(
input_file
,
"r"
)
as
reader
:
while
True
:
line
=
tokenization
.
convert_to_unicode
(
reader
.
readline
())
if
not
line
:
...
...
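The body of `read_examples` is elided in this view. For orientation only: the upstream google-research BERT `extract_features.py`, which this file ports, reads one example per line with an optional " ||| " separator between `text_a` and `text_b`. That convention is an assumption here, since the parsing code is not shown in the diff. A hypothetical line and how it would split:

# Hypothetical input line, assuming the upstream "text_a ||| text_b" convention.
line = "the man went to the store ||| he bought a gallon of milk"
parts = line.split(" ||| ")
text_a = parts[0]
text_b = parts[1] if len(parts) > 1 else None  # single-sentence lines have no text_b
print(text_a, "//", text_b)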
@@ -317,22 +219,22 @@ def read_examples(input_file):
 def main():
-    tf.logging.set_verbosity(tf.logging.INFO)
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # print("Initializing the distributed backend: NCCL")
+    print("device", device, "n_gpu", n_gpu)

     layer_indexes = [int(x) for x in args.layers.split(",")]

-    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
+    bert_config = BertConfig.from_json_file(args.bert_config_file)

     tokenizer = tokenization.FullTokenizer(
         vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

-    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-    run_config = tf.contrib.tpu.RunConfig(
-        master=args.master,
-        tpu_config=tf.contrib.tpu.TPUConfig(
-            num_shards=args.num_tpu_cores,
-            per_host_input_for_training=is_per_host))
-
     examples = read_examples(args.input_file)

     features = convert_examples_to_features(
...
...
@@ -342,48 +244,55 @@ def main():
     for feature in features:
         unique_id_to_feature[feature.unique_id] = feature

-    model_fn = model_fn_builder(
-        bert_config=bert_config,
-        init_checkpoint=args.init_checkpoint,
-        layer_indexes=layer_indexes,
-        use_tpu=args.use_tpu,
-        use_one_hot_embeddings=args.use_one_hot_embeddings)
-
-    # If TPU is not available, this will fall back to normal Estimator on CPU
-    # or GPU.
-    estimator = tf.contrib.tpu.TPUEstimator(
-        use_tpu=args.use_tpu,
-        model_fn=model_fn,
-        config=run_config,
-        predict_batch_size=args.batch_size)
-
-    input_fn = input_fn_builder(
-        features=features, seq_length=args.max_seq_length)
-
-    with codecs.getwriter("utf-8")(tf.gfile.Open(args.output_file, "w")) as writer:
-        for result in estimator.predict(input_fn, yield_single_examples=True):
-            unique_id = int(result["unique_id"])
-            feature = unique_id_to_feature[unique_id]
-            output_json = collections.OrderedDict()
-            output_json["linex_index"] = unique_id
-            all_features = []
-            for (i, token) in enumerate(feature.tokens):
-                all_layers = []
-                for (j, layer_index) in enumerate(layer_indexes):
-                    layer_output = result["layer_output_%d" % j]
-                    layers = collections.OrderedDict()
-                    layers["index"] = layer_index
-                    layers["values"] = [round(float(x), 6) for x in layer_output[i:(i + 1)].flat]
-                    all_layers.append(layers)
-                features = collections.OrderedDict()
-                features["token"] = token
-                features["layers"] = all_layers
-                all_features.append(features)
-            output_json["features"] = all_features
-            writer.write(json.dumps(output_json) + "\n")
+    model = BertModel(bert_config)
+    if args.init_checkpoint is not None:
+        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
+    model.to(device)
+
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+
+    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+    if args.local_rank == -1:
+        eval_sampler = SequentialSampler(eval_data)
+    else:
+        eval_sampler = DistributedSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+
+    model.eval()
+    with open(args.output_file, "w", encoding='utf-8') as writer:
+        for input_ids, input_mask, segment_ids, example_indices in eval_dataloader:
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.float().to(device)
+            segment_ids = segment_ids.to(device)
+
+            all_encoder_layers, _ = model(input_ids, segment_ids, input_mask)
+
+            for enc_layers, example_index in zip(all_encoder_layers, example_indices):
+                feature = features[example_index.item()]
+                unique_id = int(feature.unique_id)
+                # feature = unique_id_to_feature[unique_id]
+                output_json = collections.OrderedDict()
+                output_json["linex_index"] = unique_id
+                all_features = []
+                for (i, token) in enumerate(feature.tokens):
+                    all_layers = []
+                    for (j, layer_index) in enumerate(layer_indexes):
+                        layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
+                        layers = collections.OrderedDict()
+                        layers["index"] = layer_index
+                        layers["values"] = [round(float(x), 6) for x in layer_output[i:(i + 1)].flat]
+                        all_layers.append(layers)
+                    features = collections.OrderedDict()
+                    features["token"] = token
+                    features["layers"] = all_layers
+                    all_features.append(features)
+                output_json["features"] = all_features
+                writer.write(json.dumps(output_json) + "\n")


 if __name__ == "__main__":
...
...
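The rewritten main() above emits one JSON object per input example, one object per line, with the keys "linex_index" and "features" (per-token "token" plus "layers" entries holding "index" and "values"). A minimal sketch of consuming that output; the file name "output.jsonl" is an illustrative stand-in for whatever is passed as --output_file:

import json

with open("output.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        # Keys mirror the writer loop in the diff above.
        print("example", record["linex_index"])
        for token_feature in record["features"]:
            first = token_feature["layers"][0]
            print(token_feature["token"], "layer", first["index"], "dim", len(first["values"]))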
run_squad_pytorch.py
...
...
@@ -23,8 +23,6 @@ import logging
 import json
 import math
 import os
-import modeling
-import optimization
 import tokenization
 import six
 import argparse
...
...
@@ -57,7 +55,7 @@ parser.add_argument("--predict_file", default=None, type=str,
                     help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
 parser.add_argument("--init_checkpoint", default=None, type=str,
                     help="Initial checkpoint (usually from a pre-trained BERT model).")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--max_seq_length", default=384, type=int,
...
...
@@ -68,8 +66,8 @@ parser.add_argument("--doc_stride", default=128, type=int,
 parser.add_argument("--max_query_length", default=64, type=int,
                     help="The maximum number of tokens for the question. Questions longer than this will "
                          "be truncated to this length.")
-parser.add_argument("--do_train", default=False, type=bool, help="Whether to run training.")
-parser.add_argument("--do_predict", default=False, type=bool, help="Whether to run eval on the dev set.")
+parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
+parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
 parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
 parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
 parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
...
...
@@ -87,19 +85,19 @@ parser.add_argument("--max_answer_length", default=30, type=int,
"and end predictions are not conditioned on one another."
)
### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
parser
.
add_argument
(
"--use_tpu"
,
default
=
False
,
type
=
bool
,
help
=
"Whether to use TPU or GPU/CPU."
)
parser
.
add_argument
(
"--tpu_name"
,
default
=
None
,
type
=
str
,
help
=
"The Cloud TPU to use for training. This should be either the name used when creating the "
"Cloud TPU, or a grpc://ip.address.of.tpu:8470 url."
)
parser
.
add_argument
(
"--tpu_zone"
,
default
=
None
,
type
=
str
,
help
=
"[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
"to automatically detect the GCE project from metadata."
)
parser
.
add_argument
(
"--gcp_project"
,
default
=
None
,
type
=
str
,
help
=
"[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
"to automatically detect the GCE project from metadata."
)
parser
.
add_argument
(
"--master"
,
default
=
None
,
type
=
str
,
help
=
"[Optional] TensorFlow master URL."
)
parser
.
add_argument
(
"--num_tpu_cores"
,
default
=
8
,
type
=
int
,
help
=
"Only used if `use_tpu` is True. "
"Total number of TPU cores to use."
)
#
parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
#
parser.add_argument("--tpu_name", default=None, type=str,
#
help="The Cloud TPU to use for training. This should be either the name used when creating the "
#
"Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
#
parser.add_argument("--tpu_zone", default=None, type=str,
#
help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
#
"to automatically detect the GCE project from metadata.")
#
parser.add_argument("--gcp_project", default=None, type=str,
#
help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
#
"to automatically detect the GCE project from metadata.")
#
parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
#
parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
#
"Total number of TPU cores to use.")
### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
parser
.
add_argument
(
"--verbose_logging"
,
default
=
False
,
type
=
bool
,
...
...
@@ -864,7 +862,7 @@ def main():
         eval_sampler = SequentialSampler(eval_data)
     else:
         eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

     model.eval()
     all_results = []
...
...
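Both files now share the same evaluation-loader idiom: a SequentialSampler for single-process runs and a DistributedSampler when --local_rank is set. A self-contained sketch of that pattern under assumed toy data (the tensor shapes, batch size, and local_rank stand-in are illustrative, not from the commit):

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

local_rank = -1  # stand-in for args.local_rank; -1 means no distributed setup
inputs = torch.randint(0, 100, (16, 8))   # 16 toy examples of sequence length 8
example_index = torch.arange(inputs.size(0))

eval_data = TensorDataset(inputs, example_index)
if local_rank == -1:
    sampler = SequentialSampler(eval_data)   # deterministic order, one process
else:
    sampler = DistributedSampler(eval_data)  # shards examples across processes
loader = DataLoader(eval_data, sampler=sampler, batch_size=4)

for batch_inputs, batch_index in loader:
    pass  # the model forward pass would consume batch_inputs here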