Commit ebfffa0a authored by thomwolf

updated extract_features

parent 9af479b3
@@ -18,16 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import argparse
 import codecs
 import collections
+import logging
 import json
 import re
-import modeling
 import tokenization
-import tensorflow as tf
-import argparse
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+from modeling_pytorch import BertConfig, BertModel
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
 parser = argparse.ArgumentParser()
@@ -47,19 +55,14 @@ parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
 parser.add_argument("--max_seq_length", default=128, type=int,
                     help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                          "than this will be truncated, and sequences shorter than this will be padded.")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
-### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--master", default=None, type=str, help="If using a TPU, the address of the master.")
-parser.add_argument("--num_tpu_cores", default=8, type=int,
-                    help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
-### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_one_hot_embeddings", default=False, type=bool,
-                    help="If True, tf.one_hot will be used for embedding lookups, otherwise tf.nn.embedding_lookup "
-                         "will be used. On TPUs, this should be True since it is much faster.")
+parser.add_argument("--local_rank",
+                    type=int,
+                    default=-1,
+                    help = "local_rank for distributed training on gpus")
 args = parser.parse_args()
@@ -83,107 +86,6 @@ class InputFeatures(object):
     self.input_type_ids = input_type_ids
-def input_fn_builder(features, seq_length):
-  """Creates an `input_fn` closure to be passed to TPUEstimator."""
-  all_unique_ids = []
-  all_input_ids = []
-  all_input_mask = []
-  all_input_type_ids = []
-  for feature in features:
-    all_unique_ids.append(feature.unique_id)
-    all_input_ids.append(feature.input_ids)
-    all_input_mask.append(feature.input_mask)
-    all_input_type_ids.append(feature.input_type_ids)
-  def input_fn(params):
-    """The actual input function."""
-    batch_size = params["batch_size"]
-    num_examples = len(features)
-    # This is for demo purposes and does NOT scale to large data sets. We do
-    # not use Dataset.from_generator() because that uses tf.py_func which is
-    # not TPU compatible. The right way to load data is with TFRecordReader.
-    d = tf.data.Dataset.from_tensor_slices({
-        "unique_ids":
-            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
-        "input_ids":
-            tf.constant(
-                all_input_ids, shape=[num_examples, seq_length],
-                dtype=tf.int32),
-        "input_mask":
-            tf.constant(
-                all_input_mask,
-                shape=[num_examples, seq_length],
-                dtype=tf.int32),
-        "input_type_ids":
-            tf.constant(
-                all_input_type_ids,
-                shape=[num_examples, seq_length],
-                dtype=tf.int32),
-    })
-    d = d.batch(batch_size=batch_size, drop_remainder=False)
-    return d
-  return input_fn
-def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
-                     use_one_hot_embeddings):
-  """Returns `model_fn` closure for TPUEstimator."""
-  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
-    """The `model_fn` for TPUEstimator."""
-    unique_ids = features["unique_ids"]
-    input_ids = features["input_ids"]
-    input_mask = features["input_mask"]
-    input_type_ids = features["input_type_ids"]
-    model = modeling.BertModel(
-        config=bert_config,
-        is_training=False,
-        input_ids=input_ids,
-        input_mask=input_mask,
-        token_type_ids=input_type_ids,
-        use_one_hot_embeddings=use_one_hot_embeddings)
-    if mode != tf.estimator.ModeKeys.PREDICT:
-      raise ValueError("Only PREDICT modes are supported: %s" % (mode))
-    tvars = tf.trainable_variables()
-    scaffold_fn = None
-    (assignment_map, _) = modeling.get_assigment_map_from_checkpoint(
-        tvars, init_checkpoint)
-    if use_tpu:
-      def tpu_scaffold():
-        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-        return tf.train.Scaffold()
-      scaffold_fn = tpu_scaffold
-    else:
-      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-    all_layers = model.get_all_encoder_layers()
-    predictions = {
-        "unique_id": unique_ids,
-    }
-    for (i, layer_index) in enumerate(layer_indexes):
-      predictions["layer_output_%d" % i] = all_layers[layer_index]
-    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
-    return output_spec
-  return model_fn
 def convert_examples_to_features(examples, seq_length, tokenizer):
   """Loads a data file into a list of `InputBatch`s."""
@@ -257,12 +159,12 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
     assert len(input_type_ids) == seq_length
     if ex_index < 5:
-      tf.logging.info("*** Example ***")
-      tf.logging.info("unique_id: %s" % (example.unique_id))
-      tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-      tf.logging.info(
+      logger.info("*** Example ***")
+      logger.info("unique_id: %s" % (example.unique_id))
+      logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+      logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      logger.info(
           "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
     features.append(
@@ -296,7 +198,7 @@ def read_examples(input_file):
   """Read a list of `InputExample`s from an input file."""
   examples = []
   unique_id = 0
-  with tf.gfile.GFile(input_file, "r") as reader:
+  with open(input_file, "r") as reader:
     while True:
       line = tokenization.convert_to_unicode(reader.readline())
       if not line:
@@ -317,22 +219,22 @@ def read_examples(input_file):
 def main():
-  tf.logging.set_verbosity(tf.logging.INFO)
+  if args.local_rank == -1 or args.no_cuda:
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    n_gpu = torch.cuda.device_count()
+  else:
+    device = torch.device("cuda", args.local_rank)
+    n_gpu = 1
+    # print("Initializing the distributed backend: NCCL")
+  print("device", device, "n_gpu", n_gpu)
   layer_indexes = [int(x) for x in args.layers.split(",")]
-  bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
+  bert_config = BertConfig.from_json_file(args.bert_config_file)
   tokenizer = tokenization.FullTokenizer(
       vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
-  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-  run_config = tf.contrib.tpu.RunConfig(
-      master=args.master,
-      tpu_config=tf.contrib.tpu.TPUConfig(
-          num_shards=args.num_tpu_cores,
-          per_host_input_for_training=is_per_host))
   examples = read_examples(args.input_file)
   features = convert_examples_to_features(
@@ -342,48 +244,55 @@ def main():
   for feature in features:
     unique_id_to_feature[feature.unique_id] = feature
-  model_fn = model_fn_builder(
-      bert_config=bert_config,
-      init_checkpoint=args.init_checkpoint,
-      layer_indexes=layer_indexes,
-      use_tpu=args.use_tpu,
-      use_one_hot_embeddings=args.use_one_hot_embeddings)
-  # If TPU is not available, this will fall back to normal Estimator on CPU
-  # or GPU.
-  estimator = tf.contrib.tpu.TPUEstimator(
-      use_tpu=args.use_tpu,
-      model_fn=model_fn,
-      config=run_config,
-      predict_batch_size=args.batch_size)
-  input_fn = input_fn_builder(
-      features=features, seq_length=args.max_seq_length)
-  with codecs.getwriter("utf-8")(tf.gfile.Open(args.output_file,
-                                               "w")) as writer:
-    for result in estimator.predict(input_fn, yield_single_examples=True):
-      unique_id = int(result["unique_id"])
-      feature = unique_id_to_feature[unique_id]
-      output_json = collections.OrderedDict()
-      output_json["linex_index"] = unique_id
-      all_features = []
-      for (i, token) in enumerate(feature.tokens):
-        all_layers = []
-        for (j, layer_index) in enumerate(layer_indexes):
-          layer_output = result["layer_output_%d" % j]
-          layers = collections.OrderedDict()
-          layers["index"] = layer_index
-          layers["values"] = [
-              round(float(x), 6) for x in layer_output[i:(i + 1)].flat
-          ]
-          all_layers.append(layers)
-        features = collections.OrderedDict()
-        features["token"] = token
-        features["layers"] = all_layers
-        all_features.append(features)
-      output_json["features"] = all_features
-      writer.write(json.dumps(output_json) + "\n")
+  model = BertModel(bert_config)
+  if args.init_checkpoint is not None:
+    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
+  model.to(device)
+  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+  all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+  all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+  eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+  if args.local_rank == -1:
+    eval_sampler = SequentialSampler(eval_data)
+  else:
+    eval_sampler = DistributedSampler(eval_data)
+  eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+  model.eval()
+  with open(args.output_file, "w", encoding='utf-8') as writer:
+    for input_ids, input_mask, segment_ids, example_indices in eval_dataloader:
+      input_ids = input_ids.to(device)
+      input_mask = input_mask.float().to(device)
+      segment_ids = segment_ids.to(device)
+      all_encoder_layers, _ = model(input_ids, segment_ids, input_mask)
+      for enc_layers, example_index in zip(all_encoder_layers, example_indices):
+        feature = features[example_index.item()]
+        unique_id = int(feature.unique_id)
+        # feature = unique_id_to_feature[unique_id]
+        output_json = collections.OrderedDict()
+        output_json["linex_index"] = unique_id
+        all_features = []
+        for (i, token) in enumerate(feature.tokens):
+          all_layers = []
+          for (j, layer_index) in enumerate(layer_indexes):
+            layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
+            layers = collections.OrderedDict()
+            layers["index"] = layer_index
+            layers["values"] = [
+                round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+            ]
+            all_layers.append(layers)
+          features = collections.OrderedDict()
+          features["token"] = token
+          features["layers"] = all_layers
+          all_features.append(features)
+        output_json["features"] = all_features
+        writer.write(json.dumps(output_json) + "\n")
 if __name__ == "__main__":
......
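Note on the rewritten extraction loop above: it writes one JSON object per input line to args.output_file, using the structure built in output_json (a "features" list holding, per token, one entry per requested layer index). A minimal sketch of reading that output back, assuming a hypothetical file name output.jsonl:

import json

with open("output.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        for token_feature in record["features"]:
            token = token_feature["token"]
            for layer in token_feature["layers"]:
                # layer["index"] is one of the --layers indexes (e.g. -1, -2, -3, -4)
                # layer["values"] is that layer's hidden-state vector, rounded to 6 decimals
                vector = layer["values"]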
@@ -23,8 +23,6 @@ import logging
 import json
 import math
 import os
-import modeling
-import optimization
 import tokenization
 import six
 import argparse
@@ -57,7 +55,7 @@ parser.add_argument("--predict_file", default=None, type=str,
                     help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
 parser.add_argument("--init_checkpoint", default=None, type=str,
                     help="Initial checkpoint (usually from a pre-trained BERT model).")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--max_seq_length", default=384, type=int,
@@ -68,8 +66,8 @@ parser.add_argument("--doc_stride", default=128, type=int,
 parser.add_argument("--max_query_length", default=64, type=int,
                     help="The maximum number of tokens for the question. Questions longer than this will "
                          "be truncated to this length.")
-parser.add_argument("--do_train", default=False, type=bool, help="Whether to run training.")
-parser.add_argument("--do_predict", default=False, type=bool, help="Whether to run eval on the dev set.")
+parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
+parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
 parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
 parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
 parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
@@ -87,19 +85,19 @@ parser.add_argument("--max_answer_length", default=30, type=int,
                          "and end predictions are not conditioned on one another.")
 ### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--tpu_name", default=None, type=str,
-                    help="The Cloud TPU to use for training. This should be either the name used when creating the "
-                         "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
-parser.add_argument("--tpu_zone", default=None, type=str,
-                    help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--gcp_project", default=None, type=str,
-                    help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
-parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
-                    "Total number of TPU cores to use.")
+# parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
+# parser.add_argument("--tpu_name", default=None, type=str,
+#                     help="The Cloud TPU to use for training. This should be either the name used when creating the "
+#                          "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
+# parser.add_argument("--tpu_zone", default=None, type=str,
+#                     help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--gcp_project", default=None, type=str,
+#                     help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
+# parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
+#                     "Total number of TPU cores to use.")
 ### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
 parser.add_argument("--verbose_logging", default=False, type=bool,
@@ -864,7 +862,7 @@ def main():
       eval_sampler = SequentialSampler(eval_data)
     else:
      eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
     model.eval()
     all_results = []
......
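For context on the type=bool to action='store_true' changes in both files: argparse passes the command-line string through type, and bool("False") is truthy, so a type=bool flag can never be switched off from the command line, while a store_true flag is simply present or absent. A small standalone illustration (not part of the commit):

import argparse

old_style = argparse.ArgumentParser()
old_style.add_argument("--do_train", default=False, type=bool)
print(old_style.parse_args(["--do_train", "False"]).do_train)  # True, because bool("False") is True

new_style = argparse.ArgumentParser()
new_style.add_argument("--do_train", default=False, action='store_true')
print(new_style.parse_args(["--do_train"]).do_train)  # True when the flag is given
print(new_style.parse_args([]).do_train)               # False otherwise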