Commit 936e813c authored by thomwolf

clean up examples - added squad example and test

parent 699bc7e8
import torch
from torch.nn import functional as F
from pytorch_transformers import XLNetModel, XLNetLMHeadModel, XLNetTokenizer
import logging
logging.basicConfig(level=logging.INFO)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', attn_type='uni')
tokens = tokenizer.encode('I am very happy')
for i in range(len(tokens), 20):
mask = torch.tensor([[[0.0] * i + [1.0]]])
logits, _ = model(torch.tensor([tokens + [0]]),
# perm_mask=mask.expand(-1, i+1, -1),
target_mapping=mask,
inp_q=mask.squeeze(1))
output = torch.multinomial(F.softmax(logits[0, 0, :]), 1)
tokens.append(output.item())
print(tokenizer.decode(tokens))
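# Note on the inputs above (my reading of the pytorch-transformers XLNet API, stated as an
# assumption rather than documentation): `target_mapping` has shape (batch, num_predict, seq_len)
# and selects the single appended position as the one to predict, while `inp_q` marks that same
# position as masked so the model cannot peek at it. Each loop iteration therefore appends one
# dummy token (id 0), samples a real token for that slot from the softmax over the vocabulary,
# and feeds the growing sequence back in.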
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import json
import re
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.modeling_bert import BertModel
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class InputExample(object):
def __init__(self, unique_id, text_a, text_b):
self.unique_id = unique_id
self.text_a = text_a
self.text_b = text_b
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
self.unique_id = unique_id
self.tokens = tokens
self.input_ids = input_ids
self.input_mask = input_mask
self.input_type_ids = input_type_ids
def convert_examples_to_features(examples, seq_length, tokenizer):
"""Loads a data file into a list of `InputFeature`s."""
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > seq_length - 2:
tokens_a = tokens_a[0:(seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambigiously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
input_type_ids = []
tokens.append("[CLS]")
input_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
input_type_ids.append(0)
tokens.append("[SEP]")
input_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
input_type_ids.append(1)
tokens.append("[SEP]")
input_type_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
input_type_ids.append(0)
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(input_type_ids) == seq_length
if ex_index < 5:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (example.unique_id))
logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
features.append(
InputFeatures(
unique_id=example.unique_id,
tokens=tokens,
input_ids=input_ids,
input_mask=input_mask,
input_type_ids=input_type_ids))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
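# A small worked example of the truncation heuristic above (illustrative only; not part of the
# original file):
#   tokens_a = ['the', 'dog', 'is', 'hairy']; tokens_b = ['no']
#   _truncate_seq_pair(tokens_a, tokens_b, max_length=3)
#   # pops twice from the longer list -> tokens_a == ['the', 'dog'], tokens_b == ['no']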
def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples = []
unique_id = 0
with open(input_file, "r", encoding='utf-8') as reader:
while True:
line = reader.readline()
if not line:
break
line = line.strip()
text_a = None
text_b = None
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
text_a = line
else:
text_a = m.group(1)
text_b = m.group(2)
examples.append(
InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
unique_id += 1
return examples
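# Expected input file format, inferred from the regex above: one example per line, with an
# optional " ||| " separator splitting text_a from text_b, e.g.
#   Who was Jim Henson ? ||| Jim Henson was a puppeteer
#   A single sentence with no second segment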
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_file", default=None, type=str, required=True)
parser.add_argument("--output_file", default=None, type=str, required=True)
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
"than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help = "local_rank for distributed training on gpus")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
layer_indexes = [int(x) for x in args.layers.split(",")]
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
examples = read_examples(args.input_file)
features = convert_examples_to_features(
examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
unique_id_to_feature[feature.unique_id] = feature
model = BertModel.from_pretrained(args.bert_model)
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
if args.local_rank == -1:
eval_sampler = SequentialSampler(eval_data)
else:
eval_sampler = DistributedSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
model.eval()
with open(args.output_file, "w", encoding='utf-8') as writer:
for input_ids, input_mask, example_indices in eval_dataloader:
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
all_encoder_layers = all_encoder_layers
for b, example_index in enumerate(example_indices):
feature = features[example_index.item()]
unique_id = int(feature.unique_id)
# feature = unique_id_to_feature[unique_id]
output_json = collections.OrderedDict()
output_json["linex_index"] = unique_id
all_out_features = []
for (i, token) in enumerate(feature.tokens):
all_layers = []
for (j, layer_index) in enumerate(layer_indexes):
layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
layer_output = layer_output[b]
layers = collections.OrderedDict()
layers["index"] = layer_index
layers["values"] = [
round(x.item(), 6) for x in layer_output[i]
]
all_layers.append(layers)
out_features = collections.OrderedDict()
out_features["token"] = token
out_features["layers"] = all_layers
all_out_features.append(out_features)
output_json["features"] = all_out_features
writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
main()
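# A possible invocation of this script (the file name and paths are illustrative assumptions):
#   python extract_features.py --input_file input.txt --output_file output.jsonl \
#       --bert_model bert-base-uncased --do_lower_case --layers -1,-2,-3,-4
# Each line of the output file is then a JSON object shaped like
#   {"linex_index": 0, "features": [{"token": "[CLS]",
#       "layers": [{"index": -1, "values": [0.123456, ...]}, ...]}, ...]}
# i.e. one entry per wordpiece token holding the hidden states of the requested layers.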
@@ -306,9 +306,9 @@ def main():
                         help="Set this flag if you are using an uncased model.")
     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
-                        help="Batch size per GPU for training.")
+                        help="Batch size per GPU/CPU for training.")
     parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
-                        help="Batch size per GPU for evaluation.")
+                        help="Batch size per GPU/CPU for evaluation.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument("--learning_rate", default=5e-5, type=float,
@@ -395,8 +395,7 @@ def main():
     # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        # Make sure only the first process in distributed training will download model & vocab
-        torch.distributed.barrier()
+        torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
     args.model_type = ""
     for key in MODEL_CLASSES:
@@ -409,7 +408,7 @@ def main():
     model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
     if args.local_rank == 0:
-        torch.distributed.barrier()
+        torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
     # Distributed and parrallel training
     model.to(args.device)
@@ -422,6 +421,7 @@ def main():
     logger.info("Training/evaluation parameters %s", args)
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
@@ -450,6 +450,7 @@ def main():
         tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
     # Evaluation
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
@@ -459,7 +460,7 @@ def main():
         logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
-            global_step = checkpoint.split('-')[-1]
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
             result = evaluate(args, model, tokenizer, prefix=global_step)
...
#!/usr/bin/env python3
import argparse
import logging
from tqdm import trange
import torch
import torch.nn.functional as F
import numpy as np
from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
def top_k_logits(logits, k):
"""
Masks everything but the k top entries as -infinity (-1e10).
Used to mask logits such that e^-infinity -> 0 won't contribute to the
sum of the denominator.
"""
if k == 0:
return logits
else:
values = torch.topk(logits, k)[0]
batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits)
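# Quick illustration of the masking above (not part of the original script):
#   logits = torch.tensor([[1.0, 5.0, 3.0, 2.0]])
#   top_k_logits(logits, k=2)  # -> tensor([[-1e10, 5.0, 3.0, -1e10]])
# so after the softmax only the two highest-scoring tokens keep non-zero probability.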
def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
if start_token is None:
assert context is not None, 'Specify exactly one of start_token and context!'
context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
else:
assert context is None, 'Specify exactly one of start_token and context!'
context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
prev = context
output = context
past = None
with torch.no_grad():
for i in trange(length):
logits, past = model(prev, past=past)
logits = logits[:, -1, :] / temperature
logits = top_k_logits(logits, k=top_k)
log_probs = F.softmax(logits, dim=-1)
if sample:
prev = torch.multinomial(log_probs, num_samples=1)
else:
_, prev = torch.topk(log_probs, k=1, dim=-1)
output = torch.cat((output, prev), dim=1)
return output
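# Note: after the first step the loop above feeds only the newly sampled token back into the
# model and reuses the cached `past` key/value states returned by GPT2LMHeadModel, so each
# generation step avoids re-encoding the whole prefix.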
def run_model():
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--nsamples", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=-1)
parser.add_argument("--length", type=int, default=-1)
parser.add_argument("--temperature", type=float, default=1.0)
parser.add_argument("--top_k", type=int, default=0)
parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
args = parser.parse_args()
print(args)
if args.batch_size == -1:
args.batch_size = 1
assert args.nsamples % args.batch_size == 0
np.random.seed(args.seed)
torch.random.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
model.to(device)
model.eval()
if args.length == -1:
args.length = model.config.n_ctx // 2
elif args.length > model.config.n_ctx:
raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
while True:
context_tokens = []
if not args.unconditional:
raw_text = input("Model prompt >>> ")
while not raw_text:
print('Prompt should not be empty!')
raw_text = input("Model prompt >>> ")
context_tokens = enc.encode(raw_text)
generated = 0
for _ in range(args.nsamples // args.batch_size):
out = sample_sequence(
model=model, length=args.length,
context=context_tokens,
start_token=None,
batch_size=args.batch_size,
temperature=args.temperature, top_k=args.top_k, device=device
)
out = out[:, len(context_tokens):].tolist()
for i in range(args.batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
print("=" * 80)
else:
generated = 0
for _ in range(args.nsamples // args.batch_size):
out = sample_sequence(
model=model, length=args.length,
context=None,
start_token=enc.encoder['<|endoftext|>'],
batch_size=args.batch_size,
temperature=args.temperature, top_k=args.top_k, device=device
)
out = out[:,1:].tolist()
for i in range(args.batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
print("=" * 80)
if __name__ == '__main__':
run_model()
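# A possible invocation (the script name is an assumption; the flags come from the argparse
# definitions above):
#   python run_gpt2.py --model_name_or_path gpt2 --top_k 40 --nsamples 2
# The script then repeatedly prompts "Model prompt >>> " and prints the sampled continuations;
# with --unconditional it instead starts generation from the '<|endoftext|>' token.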
@@ -43,6 +43,8 @@ from pytorch_transformers import AdamW, WarmupLinearSchedule
 from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
 logger = logging.getLogger(__name__)
 ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
@@ -62,29 +64,29 @@ def set_seed(args):
     torch.cuda.manual_seed_all(args.seed)
-def train(args, train_dataset, model):
+def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
     if args.max_steps > 0:
-        num_train_optimization_steps = args.max_steps
+        t_total = args.max_steps
         args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
     else:
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-    # Prepare optimizer
+    # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ['bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
-    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
-                         t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
     if args.fp16:
         try:
             from apex import amp
@@ -96,72 +98,172 @@ def train(args, train_dataset, model):
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Batch size = %d", args.train_batch_size)
-    logger.info("  Total batch size (distributed) = %d", args.train_batch_size * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
+    logger.info("  Total optimization steps = %d", t_total)
     global_step = 0
     tr_loss, logging_loss = 0.0, 0.0
-    model.train()
-    optimizer.zero_grad()
-    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
-                      'labels':         batch[3]}
+                      'token_type_ids': batch[1] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'attention_mask': batch[2],
+                      'start_positions': batch[3],
+                      'end_positions':   batch[4]}
             ouputs = model(**inputs)
-            loss = ouputs[0]
-def evalutate(args, dataset, model):
-    """ Evaluate the model """
-def load_and_cache_examples(args, tokenizer, training=True):
-    """ Load data features from cache or dataset file. """
-    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+            loss = ouputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
scheduler.step() # Update learning rate schedule
optimizer.step()
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
logging_loss = tr_loss
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
logger.info("Saving model checkpoint to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
train_iterator.close()
break
return global_step, tr_loss / global_step
def evaluate(args, model, tokenizer, prefix=""):
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
all_results = []
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
example_indices = batch[3]
with torch.no_grad():
inputs = {'input_ids': batch[0],
'token_type_ids': batch[1] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
'attention_mask': batch[2]}
outputs = model(**inputs)
batch_start_logits, batch_end_logits = outputs[:2]
for i, example_index in enumerate(example_indices):
start_logits = batch_start_logits[i].detach().cpu().tolist()
end_logits = batch_end_logits[i].detach().cpu().tolist()
eval_feature = features[example_index.item()]
unique_id = int(eval_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
all_predictions = write_predictions(examples, features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
args.verbose_logging, args.version_2_with_negative,
args.null_score_diff_threshold)
evaluate_options = EVAL_OPTS(data_file=args.predict_file,
pred_file=output_prediction_file,
na_prob_file=output_null_log_odds_file)
results = evaluate_on_squad(evaluate_options)
return results
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
# Load data features from cache or dataset file
input_file = args.predict_file if evaluate else args.train_file
+    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
         'dev' if evaluate else 'train',
         list(filter(None, args.model_name.split('/'))).pop(),
-        str(args.max_seq_length),
-        str(task)))
-    if os.path.exists(cached_features_file):
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        label_list = processor.get_labels()
-        examples = read_squad_examples(input_file=args.train_file if training else args.predict_file,
-                                       is_training=training,
-                                       version_2_with_negative=args.version_2_with_negative)
-        features = convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=training)
+        logger.info("Creating features from dataset file at %s", input_file)
+        examples = read_squad_examples(input_file=input_file,
+                                       is_training=not evaluate,
+                                       version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate)
         if args.local_rank in [-1, 0]:
-            logger.info("Num orig examples = %d", len(examples))
-            logger.info("Num split examples = %d", len(features))
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
     # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    if training:
-        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)
-    else:
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if evaluate:
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
+    else:
+        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions)
+    if output_examples:
+        return dataset, examples, features
     return dataset
@@ -179,12 +281,17 @@ def main():
                         help="The output directory where the model checkpoints and predictions will be written.")
     ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument('--version_2_with_negative', action='store_true',
                         help='If true, the SQuAD examples contain some that do not have an answer.')
     parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                         help="If null_score - best_non_null is greater than the threshold predict null.")
-    parser.add_argument('--overwrite_output_dir', action='store_true',
-                        help="Overwrite the content of the output directory")
     parser.add_argument("--max_seq_length", default=384, type=int,
                         help="The maximum total input sequence length after WordPiece tokenization. Sequences "
@@ -196,23 +303,33 @@ def main():
                              "be truncated to this length.")
     parser.add_argument("--do_train", action='store_true',
                         help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true',
-                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    parser.add_argument("--train_batch_size", default=32, type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int,
-                        help="Total batch size for predictions.")
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
     parser.add_argument("--num_train_epochs", default=3.0, type=float,
                         help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
     parser.add_argument("--n_best_size", default=20, type=int,
                         help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
     parser.add_argument("--max_answer_length", default=30, type=int,
@@ -222,10 +339,21 @@ def main():
                         help="If true, all of the warnings related to data processing will be printed. "
                              "A number of warnings are expected for a normal SQuAD evaluation.")
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
     parser.add_argument("--no_cuda", action='store_true',
                         help="Whether not to use CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
     parser.add_argument('--seed', type=int, default=42,
                         help="random seed for initialization")
     parser.add_argument("--local_rank", type=int, default=-1,
                         help="local_rank for distributed training on gpus")
     parser.add_argument('--fp16', action='store_true',
@@ -236,11 +364,11 @@ def main():
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
-    print(args)
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+    # Setup distant debugging if needed
     if args.server_ip and args.server_port:
         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
         import ptvsd
@@ -260,29 +388,31 @@ def main():
     args.device = device
     # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
-    # Setup seeds
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
+    # Set seed
+    set_seed(args)
     # Load pretrained model and tokenizer
     if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier() # Make sure only 1st process in distributed training download model & vocab
+        torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
-    args.model_type = args.model_name.lower().split('-')[0]
-    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
     if args.local_rank == 0:
-        torch.distributed.barrier()
+        torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
     # Distributed and parrallel training
     model.to(args.device)
@@ -293,199 +423,54 @@ def main():
     elif args.n_gpu > 1:
         model = torch.nn.DataParallel(model)
+    logger.info("Training/evaluation parameters %s", args)
     # Training
     if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-        # Prepare data loader
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
logger.info(" Saving train features into cached file %s", cached_train_features_file)
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# if args.local_rank != -1:
# num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produce None grad that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
logger.info("***** Running training *****")
logger.info(" Num orig examples = %d", len(train_examples))
logger.info(" Num split examples = %d", len(train_features))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
model.train()
for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
if n_gpu == 1:
batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used and handles this automatically
lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.local_rank in [-1, 0]:
if not args.fp16:
tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
tb_writer.add_scalar('loss', loss.item(), global_step)
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)
# Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
         # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
else:
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
model.to(device)
if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_squad_examples(
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
logger.info("***** Running predictions *****")
logger.info(" Num orig examples = %d", len(eval_examples))
logger.info(" Num split examples = %d", len(eval_features))
logger.info(" Batch size = %d", args.predict_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
-        model.eval()
-        all_results = []
-        logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
-            if len(all_results) % 1000 == 0:
-                logger.info("Processing example: %d" % (len(all_results)))
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            with torch.no_grad():
-                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
-            for i, example_index in enumerate(example_indices):
-                start_logits = batch_start_logits[i].detach().cpu().tolist()
-                end_logits = batch_end_logits[i].detach().cpu().tolist()
-                eval_feature = eval_features[example_index.item()]
-                unique_id = int(eval_feature.unique_id)
-                all_results.append(RawResult(unique_id=unique_id,
-                                             start_logits=start_logits,
-                                             end_logits=end_logits))
-        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
-        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
-        write_predictions(eval_examples, eval_features, all_results,
-                          args.n_best_size, args.max_answer_length,
-                          args.do_lower_case, output_prediction_file,
-                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                          args.version_2_with_negative, args.null_score_diff_threshold)
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+    logger.info("Results: {}".format(results))
+    return results
 if __name__ == "__main__":
...
@@ -13,11 +13,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning a question-answering Bert model on SQuAD."""
-from __future__ import absolute_import, division, print_function
+"""BERT finetuning runner."""
+from __future__ import absolute_import
 import argparse
+import csv
 import logging
 import os
 import random
@@ -31,68 +32,279 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
-from tensorboardX import SummaryWriter
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_transformers.modeling_bert import BertForQuestionAnswering
+from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
 from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
 from pytorch_transformers.tokenization_bert import BertTokenizer
-from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
-logger = logging.getLogger(__name__)
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+if sys.version_info[0] == 2:
+    import cPickle as pickle
+else:
+    import pickle
+class SwagExample(object):
"""A single training/test example for the SWAG dataset."""
def __init__(self,
swag_id,
context_sentence,
start_ending,
ending_0,
ending_1,
ending_2,
ending_3,
label = None):
self.swag_id = swag_id
self.context_sentence = context_sentence
self.start_ending = start_ending
self.endings = [
ending_0,
ending_1,
ending_2,
ending_3,
]
self.label = label
def __str__(self):
return self.__repr__()
def __repr__(self):
l = [
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
]
if self.label is not None:
l.append("label: {}".format(self.label))
return ", ".join(l)
class InputFeatures(object):
def __init__(self,
example_id,
choices_features,
label
):
self.example_id = example_id
self.choices_features = [
{
'input_ids': input_ids,
'input_mask': input_mask,
'segment_ids': segment_ids
}
for _, input_ids, input_mask, segment_ids in choices_features
]
self.label = label
def read_swag_examples(input_file, is_training):
with open(input_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
if is_training and lines[0][-1] != 'label':
raise ValueError(
"For training, the input file must contain a label column."
)
examples = [
SwagExample(
swag_id = line[2],
context_sentence = line[4],
start_ending = line[5], # in the swag dataset, the
# common beginning of each
# choice is stored in "sent2".
ending_0 = line[7],
ending_1 = line[8],
ending_2 = line[9],
ending_3 = line[10],
label = int(line[11]) if is_training else None
) for line in lines[1:] # we skip the line with the column names
]
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
# Swag is a multiple choice task. To perform this task using Bert,
# we will use the formatting proposed in "Improving Language
# Understanding by Generative Pre-Training" and suggested by
# @jacobdevlin-google in this issue
# https://github.com/google-research/bert/issues/38.
#
# Each choice will correspond to a sample on which we run the
# inference. For a given Swag example, we will create the 4
# following inputs:
# - [CLS] context [SEP] choice_1 [SEP]
# - [CLS] context [SEP] choice_2 [SEP]
# - [CLS] context [SEP] choice_3 [SEP]
# - [CLS] context [SEP] choice_4 [SEP]
# The model will output a single value for each input. To get the
# final decision of the model, we will run a softmax over these 4
# outputs. (See the illustrative sketch after this function.)
features = []
for example_index, example in enumerate(examples):
context_tokens = tokenizer.tokenize(example.context_sentence)
start_ending_tokens = tokenizer.tokenize(example.start_ending)
choices_features = []
for ending_index, ending in enumerate(example.endings):
# We create a copy of the context tokens in order to be
# able to shrink it according to ending_tokens
context_tokens_choice = context_tokens[:]
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
# Modifies `context_tokens_choice` and `ending_tokens` in
# place so that the total length is less than the
# specified length. Account for [CLS], [SEP], [SEP] with
# "- 3"
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
choices_features.append((tokens, input_ids, input_mask, segment_ids))
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info("swag_id: {}".format(example.swag_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(' '.join(tokens)))
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
if is_training:
logger.info("label: {}".format(label))
features.append(
InputFeatures(
example_id = example.swag_id,
choices_features = choices_features,
label = label
)
)
return features
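# Illustrative sketch (not part of the original run_swag.py): the comment at the top of
# convert_examples_to_features describes how each SWAG example becomes four
# [CLS] context [SEP] choice [SEP] sequences. The helper below, using an invented example
# sentence and any BERT tokenizer passed in by the caller, shows that construction end to end.
def _demo_single_swag_example(tokenizer):
    example = SwagExample(swag_id="demo-0",
                          context_sentence="A drum line passes by walking down the street.",
                          start_ending="The drum line",
                          ending_0="turns the corner.",
                          ending_1="stops playing.",
                          ending_2="keeps marching down the road.",
                          ending_3="sits down on the curb.",
                          label=2)
    features = convert_examples_to_features([example], tokenizer, max_seq_length=48, is_training=True)
    # One InputFeatures per example, holding 4 choices_features entries (one per ending).
    # The model scores each sequence; a softmax over the 4 scores picks the predicted ending.
    return features[0].choices_features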
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
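# Worked example (illustrative, not in the original file): with max_length=6,
# tokens_a = ['a', 'b', 'c', 'd', 'e'] and tokens_b = ['x', 'y', 'z'], the loop pops from the
# longer list first, leaving tokens_a == ['a', 'b', 'c'] and tokens_b == ['x', 'y', 'z'].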
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def select_field(features, field):
return [
[
choice[field]
for choice in feature.choices_features
]
for feature in features
]
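# Note (illustrative, not in the original file): select_field(features, 'input_ids') returns a
# nested list shaped [num_examples][num_choices][max_seq_length]; main() below wraps these
# nested lists in torch.tensor(...) when it builds the training and evaluation TensorDatasets.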
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .csv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
"of training.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
...@@ -101,41 +313,16 @@ def main():
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--overwrite_output_dir',
action='store_true',
help="Overwrite the content of the output directory")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()
print(args)
if args.server_ip and args.server_port:
args = parser.parse_args()
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
...@@ -146,11 +333,6 @@ def main():
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
...@@ -166,95 +348,68 @@ def main():
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_predict:
raise ValueError("At least one of `do_train` or `do_predict` must be True.")
if args.do_train:
if not args.train_file:
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
raise ValueError(
"If `do_train` is True, then `train_file` must be specified.")
if args.do_predict:
if not args.predict_file:
raise ValueError(
"If `do_predict` is True, then `predict_file` must be specified.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
if args.local_rank == 0:
torch.distributed.barrier()
# Prepare model
model = BertForMultipleChoice.from_pretrained(args.bert_model,
cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)),
num_choices=4)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True)
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
if args.do_train:
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
# Prepare data loader
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
train_features = convert_examples_to_features(
train_examples, tokenizer, args.max_seq_length, True)
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
logger.info(" Saving train features into cached file %s", cached_train_features_file)
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# if args.local_rank != -1:
# num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produce None grad that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
param_optimizer = [n for n in param_optimizer]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
...@@ -281,22 +436,29 @@
global_step = 0
logger.info("***** Running training *****")
logger.info(" Num orig examples = %d", len(train_examples))
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Num split examples = %d", len(train_features))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
model.train()
for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
if n_gpu == 1:
batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0:
# rescale loss for fp16 training
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
loss = loss * args.loss_scale
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if args.fp16:
optimizer.backward(loss)
...@@ -305,19 +467,16 @@
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used and handles this automatically
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.local_rank in [-1, 0]:
if not args.fp16:
tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
tb_writer.add_scalar('loss', loss.item(), global_step)
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
if args.do_train:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
...@@ -330,69 +489,66 @@
tokenizer.save_vocabulary(args.output_dir)
# Load a trained model and vocabulary that you have fine-tuned
model = BertForQuestionAnswering.from_pretrained(args.output_dir)
model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4)
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
# Good practice: save your training arguments together with the trained model
output_args_file = os.path.join(args.output_dir, 'training_args.bin')
torch.save(args, output_args_file)
else:
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
model.to(device)
if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_squad_examples(
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
logger.info("***** Running predictions *****")
logger.info(" Num orig examples = %d", len(eval_examples))
logger.info(" Num split examples = %d", len(eval_features))
logger.info(" Batch size = %d", args.predict_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
model.eval()
all_results = []
logger.info("Start evaluating")
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % (len(all_results)))
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
with torch.no_grad():
batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
for i, example_index in enumerate(example_indices):
start_logits = batch_start_logits[i].detach().cpu().tolist()
end_logits = batch_end_logits[i].detach().cpu().tolist()
eval_feature = eval_features[example_index.item()]
unique_id = int(eval_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
write_predictions(eval_examples, eval_features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
args.version_2_with_negative, args.null_score_diff_threshold)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
eval_features = convert_examples_to_features(
eval_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': tr_loss/global_step}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD."""
from __future__ import absolute_import, division, print_function
import argparse
import logging
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from tensorboardX import SummaryWriter
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
from pytorch_transformers.modeling_xlnet import BertForQuestionAnswering
from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule
from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
## Other parameters
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
"of training.")
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--overwrite_output_dir',
action='store_true',
help="Overwrite the content of the output directory")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()
print(args)
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_predict:
raise ValueError("At least one of `do_train` or `do_predict` must be True.")
if args.do_train:
if not args.train_file:
raise ValueError(
"If `do_train` is True, then `train_file` must be specified.")
if args.do_predict:
if not args.predict_file:
raise ValueError(
"If `do_predict` is True, then `predict_file` must be specified.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
raise ValueError("Output directory {} already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
if args.local_rank == 0:
torch.distributed.barrier()
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
if args.do_train:
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
# Prepare data loader
train_examples = read_squad_examples(
input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader)
except:
train_features = convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True)
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
logger.info(" Saving train features into cached file %s", cached_train_features_file)
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# if args.local_rank != -1:
# num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove pooler, which is not used
# thus it produce None grad that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
logger.info("***** Running training *****")
logger.info(" Num orig examples = %d", len(train_examples))
logger.info(" Num split examples = %d", len(train_features))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
model.train()
for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
if n_gpu == 1:
batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
input_ids, input_mask, segment_ids, start_positions, end_positions = batch
loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used and handles this automatically
lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.local_rank in [-1, 0]:
if not args.fp16:
tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
tb_writer.add_scalar('loss', loss.item(), global_step)
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(args.output_dir)
# Load a trained model and vocabulary that you have fine-tuned
model = BertForQuestionAnswering.from_pretrained(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
# Good practice: save your training arguments together with the trained model
output_args_file = os.path.join(args.output_dir, 'training_args.bin')
torch.save(args, output_args_file)
else:
model = BertForQuestionAnswering.from_pretrained(args.bert_model)
model.to(device)
if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_squad_examples(
input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
eval_features = convert_examples_to_features(
examples=eval_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False)
logger.info("***** Running predictions *****")
logger.info(" Num orig examples = %d", len(eval_examples))
logger.info(" Num split examples = %d", len(eval_features))
logger.info(" Batch size = %d", args.predict_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
model.eval()
all_results = []
logger.info("Start evaluating")
for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
if len(all_results) % 1000 == 0:
logger.info("Processing example: %d" % (len(all_results)))
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
with torch.no_grad():
batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
for i, example_index in enumerate(example_indices):
start_logits = batch_start_logits[i].detach().cpu().tolist()
end_logits = batch_end_logits[i].detach().cpu().tolist()
eval_feature = eval_features[example_index.item()]
unique_id = int(eval_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
write_predictions(eval_examples, eval_features, all_results,
args.n_best_size, args.max_answer_length,
args.do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file, args.verbose_logging,
args.version_2_with_negative, args.null_score_diff_threshold)
if __name__ == "__main__":
main()
...@@ -28,6 +28,7 @@ except ImportError:
from mock import patch
import run_glue
import run_squad
logging.basicConfig(level=logging.DEBUG)
...@@ -64,6 +65,31 @@ class ExamplesTests(unittest.TestCase):
for value in result.values():
self.assertGreaterEqual(value, 0.75)
def test_run_squad(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
testargs = ["run_squad.py",
"--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
"--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
"--model_name=bert-base-uncased",
"--output_dir=./examples/tests_samples/temp_dir",
"--max_steps=10",
"--warmup_steps=2",
"--do_train",
"--do_eval",
"--version_2_with_negative",
"--learning_rate=1e-4",
"--per_gpu_train_batch_size=2",
"--per_gpu_eval_batch_size=1",
"--overwrite_output_dir",
"--seed=42"]
model_name = "--model_name=bert-base-uncased"
with patch.object(sys, 'argv', testargs + [model_name]):
result = run_squad.main()
self.assertGreaterEqual(result['f1'], 30)
self.assertGreaterEqual(result['exact'], 30)
if __name__ == "__main__":
unittest.main()
...@@ -2,4 +2,5 @@
cache*
temp*
!*.tsv
!*.json
!.gitignore
\ No newline at end of file
{
"version": "v2.0",
"data": [{
"title": "Normans",
"paragraphs": [{
"qas": [{
"question": "In what country is Normandy located?",
"id": "56ddde6b9a695914005b9628",
"answers": [{
"text": "France",
"answer_start": 159
}],
"is_impossible": false
}, {
"question": "When were the Normans in Normandy?",
"id": "56ddde6b9a695914005b9629",
"answers": [{
"text": "10th and 11th centuries",
"answer_start": 94
}],
"is_impossible": false
}, {
"question": "From which countries did the Norse originate?",
"id": "56ddde6b9a695914005b962a",
"answers": [{
"text": "Denmark, Iceland and Norway",
"answer_start": 256
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "Rollo",
"answer_start": 308
}],
"question": "Who did King Charles III swear fealty to?",
"id": "5ad39d53604f3c001a3fe8d3",
"answers": [],
"is_impossible": true
}, {
"plausible_answers": [{
"text": "10th century",
"answer_start": 671
}],
"question": "When did the Frankish identity emerge?",
"id": "5ad39d53604f3c001a3fe8d4",
"answers": [],
"is_impossible": true
}],
"context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
}, {
"qas": [{
"question": "Who was the duke in the battle of Hastings?",
"id": "56dddf4066d3e219004dad5f",
"answers": [{
"text": "William the Conqueror",
"answer_start": 1022
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "Antioch",
"answer_start": 1295
}],
"question": "What principality did William the conquerer found?",
"id": "5ad3a266604f3c001a3fea2b",
"answers": [],
"is_impossible": true
}],
"context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
}]
}, {
"title": "Computational_complexity_theory",
"paragraphs": [{
"qas": [{
"question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
"id": "56e16182e3433e1400422e28",
"answers": [{
"text": "Computational complexity theory",
"answer_start": 0
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "algorithm",
"answer_start": 472
}],
"question": "What is a manual application of mathematical steps?",
"id": "5ad5316b5b96ef001a10ab76",
"answers": [],
"is_impossible": true
}],
"context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
}, {
"qas": [{
"question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
"id": "56e16839cd28a01900c67887",
"answers": [{
"text": "if its solution requires significant resources",
"answer_start": 46
}],
"is_impossible": false
}, {
"question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
"id": "56e16839cd28a01900c67888",
"answers": [{
"text": "mathematical models of computation",
"answer_start": 176
}],
"is_impossible": false
}, {
"question": "What are two basic primary resources used to guage complexity?",
"id": "56e16839cd28a01900c67889",
"answers": [{
"text": "time and storage",
"answer_start": 305
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "the number of gates in a circuit",
"answer_start": 436
}],
"question": "What unit is measured to determine circuit simplicity?",
"id": "5ad532575b96ef001a10ab7f",
"answers": [],
"is_impossible": true
}, {
"plausible_answers": [{
"text": "the number of processors",
"answer_start": 502
}],
"question": "What number is used in perpendicular computing?",
"id": "5ad532575b96ef001a10ab80",
"answers": [],
"is_impossible": true
}],
"context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
}]
}]
}
\ No newline at end of file
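# Illustrative sketch (not part of the commit): how the nested SQuAD v2.0 structure above
# (data -> paragraphs -> qas, with is_impossible marking unanswerable questions) can be walked.
# The default path below is the test sample used by test_run_squad and is only an assumption here.
import json

def _iter_squad_questions(path="./examples/tests_samples/SQUAD/dev-v2.0-small.json"):
    with open(path, encoding="utf-8") as f:
        dataset = json.load(f)["data"]
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                # Unanswerable questions have is_impossible == True and an empty answers list.
                answers = [a["text"] for a in qa["answers"]]
                yield qa["id"], qa["question"], qa["is_impossible"], answers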
...@@ -556,7 +556,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
if len(nbest)==1:
nbest.insert(0,
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
...@@ -609,6 +609,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
return all_predictions
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
......
"""Official evaluation script for SQuAD version 2.0.
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question ID's to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys
class EVAL_OPTS():
def __init__(self, data_file, pred_file, out_file="",
na_prob_file="na_prob.json", na_prob_thresh=1.0,
out_image_dir=None, verbose=False):
self.data_file = data_file
self.pred_file = pred_file
self.out_file = out_file
self.na_prob_file = na_prob_file
self.na_prob_thresh = na_prob_thresh
self.out_image_dir = out_image_dir
self.verbose = verbose
OPTS = None
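# EVAL_OPTS above and parse_args() below expose the same options, so the
# evaluation can be driven either programmatically or from the command line
# (see the sketch after main() at the bottom of this file).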
def parse_args():
parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
parser.add_argument('--out-file', '-o', metavar='eval.json',
help='Write accuracy metrics to file (default is stdout).')
parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
help='Model estimates of probability of no answer.')
parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
help='Predict "" if no-answer probability exceeds this (default = 1.0).')
parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
help='Save precision-recall curves to directory.')
parser.add_argument('--verbose', '-v', action='store_true')
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
return parser.parse_args()
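# make_qid_to_has_ans maps each question id to True if it has at least one
# gold answer and to False for SQuAD 2.0 unanswerable questions; it is used
# later to split the dataset into HasAns / NoAns subsets.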
def make_qid_to_has_ans(dataset):
qid_to_has_ans = {}
for article in dataset:
for p in article['paragraphs']:
for qa in p['qas']:
qid_to_has_ans[qa['id']] = bool(qa['answers'])
return qid_to_has_ans
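# For example, normalize_answer("The Cat, sat!") below returns "cat sat":
# the text is lowercased, punctuation is stripped, the article "the" is
# removed and the remaining whitespace is collapsed.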
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
return re.sub(regex, ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s: return []
return normalize_answer(s).split()
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
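# Token-level F1 example for compute_f1 below: gold "the cat sat" and
# prediction "cat sat down" normalize to ["cat", "sat"] and
# ["cat", "sat", "down"], so num_same = 2, precision = 2/3, recall = 1.0 and
# F1 = 0.8, whereas compute_exact returns 0 for the same pair.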
def compute_f1(a_gold, a_pred):
gold_toks = get_tokens(a_gold)
pred_toks = get_tokens(a_pred)
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def get_raw_scores(dataset, preds):
exact_scores = {}
f1_scores = {}
for article in dataset:
for p in article['paragraphs']:
for qa in p['qas']:
qid = qa['id']
gold_answers = [a['text'] for a in qa['answers']
if normalize_answer(a['text'])]
if not gold_answers:
# For unanswerable questions, only correct answer is empty string
gold_answers = ['']
if qid not in preds:
print('Missing prediction for %s' % qid)
continue
a_pred = preds[qid]
# Take max over all gold answers
exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
return exact_scores, f1_scores
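# apply_no_ans_threshold below overrides the raw scores: with, say,
# na_prob_thresh = 0.5, a question whose no-answer probability is 0.9 is
# treated as answered with "", scoring 1.0 if it is unanswerable and 0.0 if it
# has a gold answer, regardless of the original exact/F1 score.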
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
new_scores = {}
for qid, s in scores.items():
pred_na = na_probs[qid] > na_prob_thresh
if pred_na:
new_scores[qid] = float(not qid_to_has_ans[qid])
else:
new_scores[qid] = s
return new_scores
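# make_eval_dict below returns an OrderedDict such as
# {"exact": 71.3, "f1": 74.6, "total": 11873} (numbers purely illustrative);
# passing qid_list restricts the averages to that subset of question ids.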
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
if not qid_list:
total = len(exact_scores)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores.values()) / total),
('f1', 100.0 * sum(f1_scores.values()) / total),
('total', total),
])
else:
total = len(qid_list)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
('total', total),
])
def merge_eval(main_eval, new_eval, prefix):
for k in new_eval:
main_eval['%s_%s' % (prefix, k)] = new_eval[k]
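# Note: 'plt' in the plotting helpers below refers to matplotlib.pyplot, which
# is imported (with the non-interactive 'Agg' backend) only under the
# __main__ guard at the bottom of this file, when an output image directory
# is requested.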
def plot_pr_curve(precisions, recalls, out_image, title):
plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.05])
plt.title(title)
plt.savefig(out_image)
plt.clf()
def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
out_image=None, title=None):
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
true_pos = 0.0
cur_p = 1.0
cur_r = 0.0
precisions = [1.0]
recalls = [0.0]
avg_prec = 0.0
for i, qid in enumerate(qid_list):
if qid_to_has_ans[qid]:
true_pos += scores[qid]
cur_p = true_pos / float(i+1)
cur_r = true_pos / float(num_true_pos)
if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
# i.e., if we can put a threshold after this point
avg_prec += cur_p * (cur_r - recalls[-1])
precisions.append(cur_p)
recalls.append(cur_r)
if out_image:
plot_pr_curve(precisions, recalls, out_image, title)
return {'ap': 100.0 * avg_prec}
def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
qid_to_has_ans, out_image_dir):
if out_image_dir and not os.path.exists(out_image_dir):
os.makedirs(out_image_dir)
num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
if num_true_pos == 0:
return
pr_exact = make_precision_recall_eval(
exact_raw, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_exact.png'),
title='Precision-Recall curve for Exact Match score')
pr_f1 = make_precision_recall_eval(
f1_raw, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_f1.png'),
title='Precision-Recall curve for F1 score')
oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
pr_oracle = make_precision_recall_eval(
oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
merge_eval(main_eval, pr_exact, 'pr_exact')
merge_eval(main_eval, pr_f1, 'pr_f1')
merge_eval(main_eval, pr_oracle, 'pr_oracle')
def histogram_na_prob(na_probs, qid_list, image_dir, name):
if not qid_list:
return
x = [na_probs[k] for k in qid_list]
weights = np.ones_like(x) / float(len(x))
plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
plt.xlabel('Model probability of no-answer')
plt.ylabel('Proportion of dataset')
plt.title('Histogram of no-answer probability: %s' % name)
plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
plt.clf()
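# find_best_thresh sweeps candidate thresholds: it starts from the score of
# predicting "" for every question (all unanswerable questions correct), then
# visits questions in order of increasing no-answer probability, adding the
# score of each answerable question and subtracting a point for every
# non-empty prediction on an unanswerable one; the best running total fixes
# the returned threshold.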
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for i, qid in enumerate(qid_list):
if qid not in scores: continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
return 100.0 * best_score / len(scores), best_thresh
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
def main(OPTS):
with open(OPTS.data_file) as f:
dataset_json = json.load(f)
dataset = dataset_json['data']
with open(OPTS.pred_file) as f:
preds = json.load(f)
if OPTS.na_prob_file:
with open(OPTS.na_prob_file) as f:
na_probs = json.load(f)
else:
na_probs = {k: 0.0 for k in preds}
qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
OPTS.na_prob_thresh)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
OPTS.na_prob_thresh)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
merge_eval(out_eval, no_ans_eval, 'NoAns')
if OPTS.na_prob_file:
find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
if OPTS.na_prob_file and OPTS.out_image_dir:
run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
qid_to_has_ans, OPTS.out_image_dir)
histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
if OPTS.out_file:
with open(OPTS.out_file, 'w') as f:
json.dump(out_eval, f)
else:
print(json.dumps(out_eval, indent=2))
return out_eval
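# A minimal sketch of programmatic use (the module and file names here are
# placeholders and may not match the surrounding codebase):
#
#   from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
#   opts = EVAL_OPTS(data_file='dev-v2.0.json', pred_file='predictions.json',
#                    na_prob_file='null_odds.json')
#   results = evaluate_on_squad(opts)  # OrderedDict with 'exact', 'f1', 'total', ...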
if __name__ == '__main__':
OPTS = parse_args()
if OPTS.out_image_dir:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
main(OPTS)