Commit 4c721c6b authored by Ananya Harsh Jha

added eval time metrics for GLUE tasks

parent 043c8781
@@ -31,6 +31,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

+from torch.nn import CrossEntropyLoss, MSELoss
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics import matthews_corrcoef, f1_score
+
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.tokenization import BertTokenizer
@@ -401,13 +405,17 @@ class WnliProcessor(DataProcessor):
         return examples


-def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer, output_mode):
     """Loads a data file into a list of `InputBatch`s."""

     label_map = {label : i for i, label in enumerate(label_list)}

     features = []
     for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
         tokens_a = tokenizer.tokenize(example.text_a)

         tokens_b = None
@@ -463,7 +471,13 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         assert len(input_mask) == max_seq_length
         assert len(segment_ids) == max_seq_length

-        label_id = label_map[example.label]
+        if output_mode == "classification":
+            label_id = label_map[example.label]
+        elif output_mode == "regression":
+            label_id = float(example.label)
+        else:
+            raise KeyError(output_mode)
+
         if ex_index < 5:
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
@@ -499,9 +513,56 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
         else:
             tokens_b.pop()

-def accuracy(out, labels):
-    outputs = np.argmax(out, axis=1)
-    return np.sum(outputs == labels)
+
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()
+
+
+def acc_and_f1(preds, labels):
+    acc = simple_accuracy(preds, labels)
+    f1 = f1_score(y_true=labels, y_pred=preds)
+    return {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,
+    }
+
+
+def pearson_and_spearman(preds, labels):
+    pearson_corr = pearsonr(preds, labels)[0]
+    spearman_corr = spearmanr(preds, labels)[0]
+    return {
+        "pearson": pearson_corr,
+        "spearmanr": spearman_corr,
+        "corr": (pearson_corr + spearman_corr) / 2,
+    }
+
+
+def compute_metrics(task_name, preds, labels):
+    assert len(preds) == len(labels)
+    if task_name == "cola":
+        return {"mcc": matthews_corrcoef(labels, preds)}
+    elif task_name == "sst-2":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "mrpc":
+        return acc_and_f1(preds, labels)
+    elif task_name == "sts-b":
+        return pearson_and_spearman(preds, labels)
+    elif task_name == "qqp":
+        return acc_and_f1(preds, labels)
+    elif task_name == "mnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "mnli-mm":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "qnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "rte":
+        return {"acc": simple_accuracy(preds, labels)}
+    elif task_name == "wnli":
+        return {"acc": simple_accuracy(preds, labels)}
+    else:
+        raise KeyError(task_name)
+
 def main():
     parser = argparse.ArgumentParser()
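For reference, the metric helpers added above can be exercised in isolation. A minimal sketch with made-up predictions (the arrays and printed values below are illustrative only, assuming the functions from this diff are in scope):

import numpy as np

preds = np.array([1, 0, 1, 1])
labels = np.array([1, 0, 0, 1])

print(compute_metrics("mrpc", preds, labels))
# {'acc': 0.75, 'f1': 0.8, 'acc_and_f1': 0.775}
print(compute_metrics("cola", preds, labels))
# {'mcc': 0.577...}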
@@ -605,6 +666,7 @@ def main():
     processors = {
         "cola": ColaProcessor,
         "mnli": MnliProcessor,
+        "mnli-mm": MnliMismatchedProcessor,
         "mrpc": MrpcProcessor,
         "sst-2": Sst2Processor,
         "sts-b": StsbProcessor,
@@ -617,6 +679,7 @@ def main():
     output_modes = {
         "cola": "classification",
         "mnli": "classification",
+        "mnli-mm": "classification",
         "mrpc": "classification",
         "sst-2": "classification",
         "sts-b": "regression",
@@ -626,13 +689,6 @@ def main():
         "wnli": "classification",
     }

-    num_labels_task = {
-        "cola": 2,
-        "sst-2": 2,
-        "mnli": 3,
-        "mrpc": 2,
-    }
-
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
@@ -671,8 +727,10 @@ def main():
         raise ValueError("Task not found: %s" % (task_name))

     processor = processors[task_name]()
-    num_labels = num_labels_task[task_name]
+    output_mode = output_modes[task_name]
+
     label_list = processor.get_labels()
+    num_labels = len(label_list)

     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
@@ -689,7 +747,7 @@ def main():
     cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
     model = BertForSequenceClassification.from_pretrained(args.bert_model,
               cache_dir=cache_dir,
-              num_labels = num_labels)
+              num_labels=num_labels)
     if args.fp16:
         model.half()
     model.to(device)
@@ -737,7 +795,7 @@ def main():
     tr_loss = 0
     if args.do_train:
         train_features = convert_examples_to_features(
-            train_examples, label_list, args.max_seq_length, tokenizer)
+            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
         logger.info("***** Running training *****")
         logger.info("  Num examples = %d", len(train_examples))
         logger.info("  Batch size = %d", args.train_batch_size)
@@ -745,7 +803,12 @@ def main():
         all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
         all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
+
         train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
         if args.local_rank == -1:
             train_sampler = RandomSampler(train_data)
@@ -760,7 +823,17 @@ def main():
             for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                 batch = tuple(t.to(device) for t in batch)
                 input_ids, input_mask, segment_ids, label_ids = batch
-                loss = model(input_ids, segment_ids, input_mask, label_ids)
+
+                # define a new function to compute loss values for both output_modes
+                logits = model(input_ids, segment_ids, input_mask, labels=None)
+
+                if output_mode == "classification":
+                    loss_fct = CrossEntropyLoss()
+                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+                elif output_mode == "regression":
+                    loss_fct = MSELoss()
+                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
+
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
                 if args.gradient_accumulation_steps > 1:
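The inline comment above flags this branching as a candidate for extraction into a function. One possible shape for that helper, using only the imports this commit adds (the name compute_loss is hypothetical, not part of the commit):

def compute_loss(logits, label_ids, output_mode, num_labels):
    # Cross-entropy over num_labels classes for classification tasks;
    # mean-squared error on the single regression output (STS-B).
    if output_mode == "classification":
        return CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))
    elif output_mode == "regression":
        return MSELoss()(logits.view(-1), label_ids.view(-1))
    raise KeyError(output_mode)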
@@ -805,22 +878,28 @@ def main():
     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         eval_examples = processor.get_dev_examples(args.data_dir)
         eval_features = convert_examples_to_features(
-            eval_examples, label_list, args.max_seq_length, tokenizer)
+            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
         logger.info("***** Running evaluation *****")
         logger.info("  Num examples = %d", len(eval_examples))
         logger.info("  Batch size = %d", args.eval_batch_size)
         all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
         all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
         all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+
+        if output_mode == "classification":
+            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
+
         eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
         # Run prediction for full data
         eval_sampler = SequentialSampler(eval_data)
         eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

         model.eval()
-        eval_loss, eval_accuracy = 0, 0
-        nb_eval_steps, nb_eval_examples = 0, 0
+        eval_loss = 0
+        nb_eval_steps = 0
+        preds = []

         for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
             input_ids = input_ids.to(device)
@@ -829,26 +908,34 @@ def main():
             label_ids = label_ids.to(device)

             with torch.no_grad():
-                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
-                logits = model(input_ids, segment_ids, input_mask)
+                logits = model(input_ids, segment_ids, input_mask, labels=None)

-            logits = logits.detach().cpu().numpy()
-            label_ids = label_ids.to('cpu').numpy()
-            tmp_eval_accuracy = accuracy(logits, label_ids)
+            # create eval loss and other metric required by the task
+            if output_mode == "classification":
+                loss_fct = CrossEntropyLoss()
+                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
+            elif output_mode == "regression":
+                loss_fct = MSELoss()
+                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

             eval_loss += tmp_eval_loss.mean().item()
-            eval_accuracy += tmp_eval_accuracy
-
-            nb_eval_examples += input_ids.size(0)
             nb_eval_steps += 1
+            if len(preds) == 0:
+                preds.append(logits.detach().cpu().numpy())
+            else:
+                preds[0] = np.append(
+                    preds[0], logits.detach().cpu().numpy(), axis=0)

         eval_loss = eval_loss / nb_eval_steps
-        eval_accuracy = eval_accuracy / nb_eval_examples
+        preds = preds[0]
+        if output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        result = compute_metrics(task_name, preds, all_label_ids.numpy())
         loss = tr_loss/nb_tr_steps if args.do_train else None
-        result = {'eval_loss': eval_loss,
-                  'eval_accuracy': eval_accuracy,
-                  'global_step': global_step,
-                  'loss': loss}
+
+        result['eval_loss'] = eval_loss
+        result['global_step'] = global_step
+        result['loss'] = loss

         output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
...
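The diff is truncated at this point. The remaining lines presumably iterate over result, logging each metric and writing it to eval_results.txt; a typical pattern for this (a sketch, not the commit's verbatim tail):

for key in sorted(result.keys()):
    logger.info("  %s = %s", key, str(result[key]))
    writer.write("%s = %s\n" % (key, str(result[key])))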