Commit ebfffa0a authored by thomwolf

updated extract_features

parent 9af479b3
@@ -18,16 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import argparse
-import codecs
 import collections
+import logging
 import json
 import re
-import modeling
 import tokenization
-import tensorflow as tf
-import argparse
+import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+from modeling_pytorch import BertConfig, BertModel
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+logger = logging.getLogger(__name__)
 parser = argparse.ArgumentParser()
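The logger configured above replaces tf.logging throughout this file. A quick standalone sketch of the record format it produces (message and timestamp are illustrative):

import logging

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("unique_id: %s", 0)
# e.g. -> 11/06/2018 10:00:00 - INFO - __main__ - unique_id: 0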
@@ -47,19 +55,14 @@ parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
 parser.add_argument("--max_seq_length", default=128, type=int,
                     help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                          "than this will be truncated, and sequences shorter than this will be padded.")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
-### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--master", default=None, type=str, help="If using a TPU, the address of the master.")
-parser.add_argument("--num_tpu_cores", default=8, type=int,
-                    help="Only used if `use_tpu` is True. Total number of TPU cores to use.")
-### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_one_hot_embeddings", default=False, type=bool,
-                    help="If True, tf.one_hot will be used for embedding lookups, otherwise tf.nn.embedding_lookup "
-                         "will be used. On TPUs, this should be True since it is much faster.")
+parser.add_argument("--local_rank", type=int, default=-1,
+                    help="local_rank for distributed training on gpus")
 args = parser.parse_args()
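A side note on the `--do_lower_case` change above: combining default=True with action='store_true' leaves no way to switch the flag off from the command line, because a store_true flag can only ever set the value to True. A minimal standalone sketch (hypothetical parser, not part of this diff):

import argparse

p = argparse.ArgumentParser()
p.add_argument("--do_lower_case", default=True, action='store_true')
print(p.parse_args([]).do_lower_case)                   # True (the default)
print(p.parse_args(["--do_lower_case"]).do_lower_case)  # True (passing the flag is a no-op)

# One common fix is to default to False and pass --do_lower_case explicitly for uncased models:
# p.add_argument("--do_lower_case", action='store_true')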
@@ -83,107 +86,6 @@ class InputFeatures(object):
         self.input_type_ids = input_type_ids
-def input_fn_builder(features, seq_length):
-    """Creates an `input_fn` closure to be passed to TPUEstimator."""
-    all_unique_ids = []
-    all_input_ids = []
-    all_input_mask = []
-    all_input_type_ids = []
-    for feature in features:
-        all_unique_ids.append(feature.unique_id)
-        all_input_ids.append(feature.input_ids)
-        all_input_mask.append(feature.input_mask)
-        all_input_type_ids.append(feature.input_type_ids)
-    def input_fn(params):
-        """The actual input function."""
-        batch_size = params["batch_size"]
-        num_examples = len(features)
-        # This is for demo purposes and does NOT scale to large data sets. We do
-        # not use Dataset.from_generator() because that uses tf.py_func which is
-        # not TPU compatible. The right way to load data is with TFRecordReader.
-        d = tf.data.Dataset.from_tensor_slices({
-            "unique_ids": tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
-            "input_ids": tf.constant(all_input_ids, shape=[num_examples, seq_length], dtype=tf.int32),
-            "input_mask": tf.constant(all_input_mask, shape=[num_examples, seq_length], dtype=tf.int32),
-            "input_type_ids": tf.constant(all_input_type_ids, shape=[num_examples, seq_length], dtype=tf.int32),
-        })
-        d = d.batch(batch_size=batch_size, drop_remainder=False)
-        return d
-    return input_fn
-def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
-                     use_one_hot_embeddings):
-    """Returns `model_fn` closure for TPUEstimator."""
-    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
-        """The `model_fn` for TPUEstimator."""
-        unique_ids = features["unique_ids"]
-        input_ids = features["input_ids"]
-        input_mask = features["input_mask"]
-        input_type_ids = features["input_type_ids"]
-        model = modeling.BertModel(
-            config=bert_config,
-            is_training=False,
-            input_ids=input_ids,
-            input_mask=input_mask,
-            token_type_ids=input_type_ids,
-            use_one_hot_embeddings=use_one_hot_embeddings)
-        if mode != tf.estimator.ModeKeys.PREDICT:
-            raise ValueError("Only PREDICT modes are supported: %s" % (mode))
-        tvars = tf.trainable_variables()
-        scaffold_fn = None
-        (assignment_map, _) = modeling.get_assigment_map_from_checkpoint(
-            tvars, init_checkpoint)
-        if use_tpu:
-            def tpu_scaffold():
-                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-                return tf.train.Scaffold()
-            scaffold_fn = tpu_scaffold
-        else:
-            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
-        all_layers = model.get_all_encoder_layers()
-        predictions = {
-            "unique_id": unique_ids,
-        }
-        for (i, layer_index) in enumerate(layer_indexes):
-            predictions["layer_output_%d" % i] = all_layers[layer_index]
-        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
-            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
-        return output_spec
-    return model_fn
 def convert_examples_to_features(examples, seq_length, tokenizer):
     """Loads a data file into a list of `InputBatch`s."""
@@ -257,12 +159,12 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
         assert len(input_type_ids) == seq_length
         if ex_index < 5:
-            tf.logging.info("*** Example ***")
-            tf.logging.info("unique_id: %s" % (example.unique_id))
-            tf.logging.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            tf.logging.info(
+            logger.info("*** Example ***")
+            logger.info("unique_id: %s" % (example.unique_id))
+            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            logger.info(
                 "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
         features.append(
@@ -296,7 +198,7 @@ def read_examples(input_file):
     """Read a list of `InputExample`s from an input file."""
     examples = []
     unique_id = 0
-    with tf.gfile.GFile(input_file, "r") as reader:
+    with open(input_file, "r") as reader:
         while True:
             line = tokenization.convert_to_unicode(reader.readline())
             if not line:
@@ -317,22 +219,22 @@
 def main():
-    tf.logging.set_verbosity(tf.logging.INFO)
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # print("Initializing the distributed backend: NCCL")
+    print("device", device, "n_gpu", n_gpu)
     layer_indexes = [int(x) for x in args.layers.split(",")]
-    bert_config = modeling.BertConfig.from_json_file(args.bert_config_file)
+    bert_config = BertConfig.from_json_file(args.bert_config_file)
     tokenizer = tokenization.FullTokenizer(
         vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
-    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-    run_config = tf.contrib.tpu.RunConfig(
-        master=args.master,
-        tpu_config=tf.contrib.tpu.TPUConfig(
-            num_shards=args.num_tpu_cores,
-            per_host_input_for_training=is_per_host))
     examples = read_examples(args.input_file)
     features = convert_examples_to_features(
@@ -342,36 +244,43 @@ def main():
     for feature in features:
         unique_id_to_feature[feature.unique_id] = feature
-    model_fn = model_fn_builder(
-        bert_config=bert_config,
-        init_checkpoint=args.init_checkpoint,
-        layer_indexes=layer_indexes,
-        use_tpu=args.use_tpu,
-        use_one_hot_embeddings=args.use_one_hot_embeddings)
-    # If TPU is not available, this will fall back to normal Estimator on CPU
-    # or GPU.
-    estimator = tf.contrib.tpu.TPUEstimator(
-        use_tpu=args.use_tpu,
-        model_fn=model_fn,
-        config=run_config,
-        predict_batch_size=args.batch_size)
-    input_fn = input_fn_builder(
-        features=features, seq_length=args.max_seq_length)
-    with codecs.getwriter("utf-8")(tf.gfile.Open(args.output_file, "w")) as writer:
-        for result in estimator.predict(input_fn, yield_single_examples=True):
-            unique_id = int(result["unique_id"])
-            feature = unique_id_to_feature[unique_id]
+    model = BertModel(bert_config)
+    if args.init_checkpoint is not None:
+        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
+    model.to(device)
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)  # InputFeatures stores input_type_ids; there is no f.segment_ids
+    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+    if args.local_rank == -1:
+        eval_sampler = SequentialSampler(eval_data)
+    else:
+        eval_sampler = DistributedSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+    model.eval()
+    with open(args.output_file, "w", encoding='utf-8') as writer:
+        for input_ids, input_mask, segment_ids, example_indices in eval_dataloader:
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.float().to(device)
+            segment_ids = segment_ids.to(device)
+            all_encoder_layers, _ = model(input_ids, segment_ids, input_mask)
+            # all_encoder_layers is a list of per-layer [batch, seq_len, hidden] tensors,
+            # so iterate over examples by batch position instead of zipping layers with indices
+            for b, example_index in enumerate(example_indices):
+                feature = features[example_index.item()]
+                unique_id = int(feature.unique_id)
+                # feature = unique_id_to_feature[unique_id]
                 output_json = collections.OrderedDict()
                 output_json["linex_index"] = unique_id
                 all_features = []
                 for (i, token) in enumerate(feature.tokens):
                     all_layers = []
                     for (j, layer_index) in enumerate(layer_indexes):
-                        layer_output = result["layer_output_%d" % j]
+                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()[b]  # select this example's row from the layer tensor
                         layers = collections.OrderedDict()
                         layers["index"] = layer_index
                         layers["values"] = [
......
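The truncated block above writes one JSON object per input example to args.output_file; negative entries in --layers address encoder layers from the top (e.g. -1 is the last layer). A runnable sketch of one output record, mirroring the loop above (token and values fabricated for illustration):

import collections
import json

record = collections.OrderedDict()
record["linex_index"] = 0
layers = [{"index": -1, "values": [0.1234, -0.5678]}]  # in practice one value per hidden dimension
record["features"] = [{"token": "[CLS]", "layers": layers}]
print(json.dumps(record))

The hunks that follow apply the same kinds of changes to the SQuAD script: the TF-only imports go away, boolean flags become store_true actions, the TPU-only arguments are commented out, and the prediction DataLoader uses --predict_batch_size.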
@@ -23,8 +23,6 @@ import logging
 import json
 import math
 import os
-import modeling
-import optimization
 import tokenization
 import six
 import argparse
@@ -57,7 +55,7 @@ parser.add_argument("--predict_file", default=None, type=str,
                     help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
 parser.add_argument("--init_checkpoint", default=None, type=str,
                     help="Initial checkpoint (usually from a pre-trained BERT model).")
-parser.add_argument("--do_lower_case", default=True, type=bool,
+parser.add_argument("--do_lower_case", default=True, action='store_true',
                     help="Whether to lower case the input text. Should be True for uncased "
                          "models and False for cased models.")
 parser.add_argument("--max_seq_length", default=384, type=int,
@@ -68,8 +66,8 @@ parser.add_argument("--doc_stride", default=128, type=int,
 parser.add_argument("--max_query_length", default=64, type=int,
                     help="The maximum number of tokens for the question. Questions longer than this will "
                          "be truncated to this length.")
-parser.add_argument("--do_train", default=False, type=bool, help="Whether to run training.")
-parser.add_argument("--do_predict", default=False, type=bool, help="Whether to run eval on the dev set.")
+parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
+parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
 parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
 parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
 parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
@@ -87,19 +85,19 @@ parser.add_argument("--max_answer_length", default=30, type=int,
                          "and end predictions are not conditioned on one another.")
 ### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
-parser.add_argument("--tpu_name", default=None, type=str,
-                    help="The Cloud TPU to use for training. This should be either the name used when creating the "
-                         "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
-parser.add_argument("--tpu_zone", default=None, type=str,
-                    help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--gcp_project", default=None, type=str,
-                    help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
-                         "to automatically detect the GCE project from metadata.")
-parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
-parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
-                                                                 "Total number of TPU cores to use.")
+# parser.add_argument("--use_tpu", default=False, type=bool, help="Whether to use TPU or GPU/CPU.")
+# parser.add_argument("--tpu_name", default=None, type=str,
+#                     help="The Cloud TPU to use for training. This should be either the name used when creating the "
+#                          "Cloud TPU, or a grpc://ip.address.of.tpu:8470 url.")
+# parser.add_argument("--tpu_zone", default=None, type=str,
+#                     help="[Optional] GCE zone where the Cloud TPU is located in. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--gcp_project", default=None, type=str,
+#                     help="[Optional] Project name for the Cloud TPU-enabled project. If not specified, we will attempt "
+#                          "to automatically detect the GCE project from metadata.")
+# parser.add_argument("--master", default=None, type=str, help="[Optional] TensorFlow master URL.")
+# parser.add_argument("--num_tpu_cores", default=8, type=int, help="Only used if `use_tpu` is True. "
+#                                                                  "Total number of TPU cores to use.")
 ### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
 parser.add_argument("--verbose_logging", default=False, type=bool,
@@ -864,7 +862,7 @@ def main():
         eval_sampler = SequentialSampler(eval_data)
     else:
         eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
     model.eval()
     all_results = []
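Both scripts build a DistributedSampler when local_rank != -1, which assumes the default process group was initialized earlier in the program. A minimal sketch of the setup this presumes (the backend choice and env-based rendezvous are assumptions, not shown in this diff):

import torch
import torch.distributed as dist

local_rank = -1  # stand-in for args.local_rank
if local_rank != -1:
    torch.cuda.set_device(local_rank)
    # DistributedSampler requires an initialized process group:
    dist.init_process_group(backend='nccl')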
......