Unverified Commit d5712f7c authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into check-link-validity

parents f230d91b 9c58b236
......@@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase):
logger.addHandler(stream_handler)
testargs = ["run_squad.py",
"--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
"--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
"--data_dir=./examples/tests_samples/SQUAD",
"--model_name=bert-base-uncased",
"--output_dir=./examples/tests_samples/temp_dir",
"--max_steps=10",
......
{
"version": "v2.0",
"data": [{
"title": "Normans",
"paragraphs": [{
"qas": [{
"question": "In what country is Normandy located?",
"id": "56ddde6b9a695914005b9628",
"answers": [{
"text": "France",
"answer_start": 159
}],
"is_impossible": false
}, {
"question": "When were the Normans in Normandy?",
"id": "56ddde6b9a695914005b9629",
"answers": [{
"text": "10th and 11th centuries",
"answer_start": 94
}],
"is_impossible": false
}, {
"question": "From which countries did the Norse originate?",
"id": "56ddde6b9a695914005b962a",
"answers": [{
"text": "Denmark, Iceland and Norway",
"answer_start": 256
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "Rollo",
"answer_start": 308
}],
"question": "Who did King Charles III swear fealty to?",
"id": "5ad39d53604f3c001a3fe8d3",
"answers": [],
"is_impossible": true
}, {
"plausible_answers": [{
"text": "10th century",
"answer_start": 671
}],
"question": "When did the Frankish identity emerge?",
"id": "5ad39d53604f3c001a3fe8d4",
"answers": [],
"is_impossible": true
}],
"context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
}, {
"qas": [{
"question": "Who was the duke in the battle of Hastings?",
"id": "56dddf4066d3e219004dad5f",
"answers": [{
"text": "William the Conqueror",
"answer_start": 1022
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "Antioch",
"answer_start": 1295
}],
"question": "What principality did William the conquerer found?",
"id": "5ad3a266604f3c001a3fea2b",
"answers": [],
"is_impossible": true
}],
"context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
}]
}, {
"title": "Computational_complexity_theory",
"paragraphs": [{
"qas": [{
"question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
"id": "56e16182e3433e1400422e28",
"answers": [{
"text": "Computational complexity theory",
"answer_start": 0
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "algorithm",
"answer_start": 472
}],
"question": "What is a manual application of mathematical steps?",
"id": "5ad5316b5b96ef001a10ab76",
"answers": [],
"is_impossible": true
}],
"context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
}, {
"qas": [{
"question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
"id": "56e16839cd28a01900c67887",
"answers": [{
"text": "if its solution requires significant resources",
"answer_start": 46
}],
"is_impossible": false
}, {
"question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
"id": "56e16839cd28a01900c67888",
"answers": [{
"text": "mathematical models of computation",
"answer_start": 176
}],
"is_impossible": false
}, {
"question": "What are two basic primary resources used to guage complexity?",
"id": "56e16839cd28a01900c67889",
"answers": [{
"text": "time and storage",
"answer_start": 305
}],
"is_impossible": false
}, {
"plausible_answers": [{
"text": "the number of gates in a circuit",
"answer_start": 436
}],
"question": "What unit is measured to determine circuit simplicity?",
"id": "5ad532575b96ef001a10ab7f",
"answers": [],
"is_impossible": true
}, {
"plausible_answers": [{
"text": "the number of processors",
"answer_start": 502
}],
"question": "What number is used in perpendicular computing?",
"id": "5ad532575b96ef001a10ab80",
"answers": [],
"is_impossible": true
}],
"context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
}]
}]
}
\ No newline at end of file
""" Official evaluation script for SQuAD version 2.0.
Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question IDs to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys
class EVAL_OPTS():
def __init__(self, data_file, pred_file, out_file="",
na_prob_file="na_prob.json", na_prob_thresh=1.0,
out_image_dir=None, verbose=False):
self.data_file = data_file
self.pred_file = pred_file
self.out_file = out_file
self.na_prob_file = na_prob_file
self.na_prob_thresh = na_prob_thresh
self.out_image_dir = out_image_dir
self.verbose = verbose
OPTS = None
def parse_args():
parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
parser.add_argument('--out-file', '-o', metavar='eval.json',
help='Write accuracy metrics to file (default is stdout).')
parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
help='Model estimates of probability of no answer.')
parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
help='Predict "" if no-answer probability exceeds this (default = 1.0).')
parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
help='Save precision-recall curves to directory.')
parser.add_argument('--verbose', '-v', action='store_true')
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
return parser.parse_args()
def make_qid_to_has_ans(dataset):
qid_to_has_ans = {}
for article in dataset:
for p in article['paragraphs']:
for qa in p['qas']:
qid_to_has_ans[qa['id']] = bool(qa['answers'])
return qid_to_has_ans
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
return re.sub(regex, ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s: return []
return normalize_answer(s).split()
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
def compute_f1(a_gold, a_pred):
gold_toks = get_tokens(a_gold)
pred_toks = get_tokens(a_pred)
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
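# Worked illustration for compute_f1 above (added comment): for a_gold = "10th and 11th centuries"
# and a_pred = "11th centuries", the normalized token bags share 2 tokens, so
# precision = 2/2 = 1.0, recall = 2/4 = 0.5 and F1 = 2 * 1.0 * 0.5 / 1.5 ≈ 0.667.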
def get_raw_scores(dataset, preds):
exact_scores = {}
f1_scores = {}
for article in dataset:
for p in article['paragraphs']:
for qa in p['qas']:
qid = qa['id']
gold_answers = [a['text'] for a in qa['answers']
if normalize_answer(a['text'])]
if not gold_answers:
# For unanswerable questions, only correct answer is empty string
gold_answers = ['']
if qid not in preds:
print('Missing prediction for %s' % qid)
continue
a_pred = preds[qid]
# Take max over all gold answers
exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
return exact_scores, f1_scores
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
new_scores = {}
for qid, s in scores.items():
pred_na = na_probs[qid] > na_prob_thresh
if pred_na:
new_scores[qid] = float(not qid_to_has_ans[qid])
else:
new_scores[qid] = s
return new_scores
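# Added note on apply_no_ans_threshold above: when na_probs[qid] exceeds the threshold the
# prediction is treated as "no answer", so the score becomes 1.0 if the question is indeed
# unanswerable and 0.0 otherwise; below the threshold the original exact/F1 score is kept unchanged.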
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
if not qid_list:
total = len(exact_scores)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores.values()) / total),
('f1', 100.0 * sum(f1_scores.values()) / total),
('total', total),
])
else:
total = len(qid_list)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
('total', total),
])
def merge_eval(main_eval, new_eval, prefix):
for k in new_eval:
main_eval['%s_%s' % (prefix, k)] = new_eval[k]
def plot_pr_curve(precisions, recalls, out_image, title):
plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.05])
plt.title(title)
plt.savefig(out_image)
plt.clf()
def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
out_image=None, title=None):
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
true_pos = 0.0
cur_p = 1.0
cur_r = 0.0
precisions = [1.0]
recalls = [0.0]
avg_prec = 0.0
for i, qid in enumerate(qid_list):
if qid_to_has_ans[qid]:
true_pos += scores[qid]
cur_p = true_pos / float(i+1)
cur_r = true_pos / float(num_true_pos)
if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
# i.e., if we can put a threshold after this point
avg_prec += cur_p * (cur_r - recalls[-1])
precisions.append(cur_p)
recalls.append(cur_r)
if out_image:
plot_pr_curve(precisions, recalls, out_image, title)
return {'ap': 100.0 * avg_prec}
def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
qid_to_has_ans, out_image_dir):
if out_image_dir and not os.path.exists(out_image_dir):
os.makedirs(out_image_dir)
num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
if num_true_pos == 0:
return
pr_exact = make_precision_recall_eval(
exact_raw, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_exact.png'),
title='Precision-Recall curve for Exact Match score')
pr_f1 = make_precision_recall_eval(
f1_raw, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_f1.png'),
title='Precision-Recall curve for F1 score')
oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
pr_oracle = make_precision_recall_eval(
oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
merge_eval(main_eval, pr_exact, 'pr_exact')
merge_eval(main_eval, pr_f1, 'pr_f1')
merge_eval(main_eval, pr_oracle, 'pr_oracle')
def histogram_na_prob(na_probs, qid_list, image_dir, name):
if not qid_list:
return
x = [na_probs[k] for k in qid_list]
weights = np.ones_like(x) / float(len(x))
plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
plt.xlabel('Model probability of no-answer')
plt.ylabel('Proportion of dataset')
plt.title('Histogram of no-answer probability: %s' % name)
plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
plt.clf()
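# Added explanatory note for the threshold search below: questions are visited in order of
# increasing no-answer probability. The running score starts from the value obtained when every
# question is answered with "" (i.e. num_no_ans correct), and as the threshold is raised past each
# qid that question's actual prediction starts to count: +scores[qid] for answerable questions,
# -1 for unanswerable ones where the model predicted a non-empty string. The threshold at which
# this running score peaks is returned together with the best score.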
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for i, qid in enumerate(qid_list):
if qid not in scores: continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
return 100.0 * best_score / len(scores), best_thresh
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for i, qid in enumerate(qid_list):
if qid not in scores: continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
has_ans_score, has_ans_cnt = 0, 0
for qid in qid_list:
if not qid_to_has_ans[qid]: continue
has_ans_cnt += 1
if qid not in scores: continue
has_ans_score += scores[qid]
return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
main_eval['has_ans_exact'] = has_ans_exact
main_eval['has_ans_f1'] = has_ans_f1
def main(OPTS):
with open(OPTS.data_file) as f:
dataset_json = json.load(f)
dataset = dataset_json['data']
with open(OPTS.pred_file) as f:
preds = json.load(f)
if OPTS.na_prob_file:
with open(OPTS.na_prob_file) as f:
na_probs = json.load(f)
else:
na_probs = {k: 0.0 for k in preds}
qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(dataset, preds)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
OPTS.na_prob_thresh)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
OPTS.na_prob_thresh)
out_eval = make_eval_dict(exact_thresh, f1_thresh)
if has_ans_qids:
has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
merge_eval(out_eval, has_ans_eval, 'HasAns')
if no_ans_qids:
no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
merge_eval(out_eval, no_ans_eval, 'NoAns')
if OPTS.na_prob_file:
find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
if OPTS.na_prob_file and OPTS.out_image_dir:
run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
qid_to_has_ans, OPTS.out_image_dir)
histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
if OPTS.out_file:
with open(OPTS.out_file, 'w') as f:
json.dump(out_eval, f)
else:
print(json.dumps(out_eval, indent=2))
return out_eval
if __name__ == '__main__':
OPTS = parse_args()
if OPTS.out_image_dir:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
main(OPTS)
......@@ -72,7 +72,6 @@ setup(
'transformers-cli'
],
# python_requires='>=3.5.0',
tests_require=['pytest'],
classifiers=[
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
......
......@@ -18,11 +18,11 @@ from __future__ import print_function
import unittest
import shutil
import pytest
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow
from transformers import XxxConfig, is_tf_available
......@@ -33,10 +33,9 @@ if is_tf_available():
TFXxxForTokenClassification,
TFXxxForQuestionAnswering,
TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
pytestmark = pytest.mark.skip("Require TensorFlow")
@require_tf
class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
......@@ -244,7 +243,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in ['xxx-base-uncased']:
......
......@@ -18,12 +18,12 @@ from __future__ import print_function
import unittest
import shutil
import pytest
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
......@@ -31,10 +31,9 @@ if is_torch_available():
XxxForQuestionAnswering, XxxForSequenceClassification,
XxxForTokenClassification, XxxForMultipleChoice)
from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
else:
pytestmark = pytest.mark.skip("Require Torch")
@require_torch
class XxxModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
......@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = XxxModel(config=config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
......@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = XxxForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
......@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = XxxForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
start_positions=sequence_labels, end_positions=sequence_labels)
......@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = XxxForSequenceClassification(config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
result = {
......@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = XxxForTokenClassification(config=config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
result = {
......@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
......@@ -26,7 +26,9 @@ from .data import (is_sklearn_available,
InputExample, InputFeatures, DataProcessor,
glue_output_modes, glue_convert_examples_to_features,
glue_processors, glue_tasks_num_labels,
xnli_output_modes, xnli_processors, xnli_tasks_num_labels)
xnli_output_modes, xnli_processors, xnli_tasks_num_labels,
squad_convert_examples_to_features, SquadFeatures,
SquadExample, SquadV1Processor, SquadV2Processor)
if is_sklearn_available():
from .data import glue_compute_metrics, xnli_compute_metrics
......@@ -35,6 +37,7 @@ if is_sklearn_available():
from .tokenization_utils import (PreTrainedTokenizer)
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer
......@@ -164,6 +167,7 @@ if is_tf_available():
from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
TFDistilBertModel, TFDistilBertForMaskedLM,
TFDistilBertForSequenceClassification,
TFDistilBertForTokenClassification,
TFDistilBertForQuestionAnswering,
TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
......@@ -174,6 +178,8 @@ if is_tf_available():
from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM,
TFAlbertForSequenceClassification,
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
# Optimization
from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator)
# TF 2.0 <=> PyTorch conversion utilities
from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
......
......@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
}
......
......@@ -24,7 +24,7 @@ import logging
import os
from io import open
from .file_utils import cached_path, CONFIG_NAME
from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
logger = logging.getLogger(__name__)
......@@ -131,8 +131,10 @@ class PretrainedConfig(object):
config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
else:
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path
else:
config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
# redirect to the cache, if necessary
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
......@@ -187,7 +189,7 @@ class PretrainedConfig(object):
@classmethod
def from_json_file(cls, json_file):
"""Constructs a `BertConfig` from a json file of parameters."""
"""Constructs a `Config` from a json file of parameters."""
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
......
......@@ -119,10 +119,11 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False) # build the network
pt_model = pt_model_class.from_pretrained(None,
state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
config=config,
state_dict=torch.load(pytorch_checkpoint_path,
map_location='cpu'))
state_dict=state_dict)
pt_inputs = torch.tensor(inputs_list)
with torch.no_grad():
pto = pt_model(pt_inputs)
......@@ -139,7 +140,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
if args_model_type is None:
......@@ -187,13 +188,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
if os.path.isfile(model_shortcut_name):
model_shortcut_name = 'converted_model'
convert_pt_checkpoint_to_tf(model_type=model_type,
pytorch_checkpoint_path=model_file,
config_file=config_file,
tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
compare_with_pt_model=compare_with_pt_model)
os.remove(config_file)
os.remove(model_file)
if remove_cached_files:
os.remove(config_file)
os.remove(model_file)
if __name__ == "__main__":
......@@ -226,6 +229,9 @@ if __name__ == "__main__":
parser.add_argument("--use_cached_models",
action='store_true',
help = "Use cached models if possible instead of updating to latest checkpoint versions.")
parser.add_argument("--remove_cached_files",
action='store_true',
help = "Remove pytorch models after conversion (save memory when converting in batches).")
parser.add_argument("--only_convert_finetuned_models",
action='store_true',
help = "Only convert finetuned models.")
......@@ -245,4 +251,5 @@ if __name__ == "__main__":
config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
compare_with_pt_model=args.compare_with_pt_model,
use_cached_models=args.use_cached_models,
remove_cached_files=args.remove_cached_files,
only_convert_finetuned_models=args.only_convert_finetuned_models)
from .processors import InputExample, InputFeatures, DataProcessor
from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor
from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
from .metrics import is_sklearn_available
......
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question IDs to the model's predicted probability
that a question is unanswerable.
"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Load SQuAD dataset. """
from __future__ import absolute_import, division, print_function
import json
import logging
......@@ -24,481 +14,371 @@ import math
import collections
from io import open
from tqdm import tqdm
import string
import re
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
logger = logging.getLogger(__name__)
class SquadExample(object):
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
return re.sub(regex, ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s:
return []
return normalize_answer(s).split()
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
def compute_f1(a_gold, a_pred):
gold_toks = get_tokens(a_gold)
pred_toks = get_tokens(a_pred)
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def get_raw_scores(examples, preds):
"""
A single training/test example for the Squad dataset.
For examples without an answer, the start and end position are -1.
Computes the exact and f1 scores from the examples and the model predictions
"""
exact_scores = {}
f1_scores = {}
for example in examples:
qas_id = example.qas_id
gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])]
if not gold_answers:
# For unanswerable questions, only correct answer is empty string
gold_answers = ['']
if qas_id not in preds:
print('Missing prediction for %s' % qas_id)
continue
prediction = preds[qas_id]
exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
return exact_scores, f1_scores
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (self.qas_id)
s += ", question_text: %s" % (
self.question_text)
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.end_position:
s += ", end_position: %d" % (self.end_position)
if self.is_impossible:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
cls_index,
p_mask,
paragraph_len,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.cls_index = cls_index
self.p_mask = p_mask
self.paragraph_len = paragraph_len
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training, version_2_with_negative):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer.")
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset + answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(
whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training,
cls_token_at_end=False,
cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
sequence_a_segment_id=0, sequence_b_segment_id=1,
cls_token_segment_id=0, pad_token_segment_id=0,
mask_padding_with_zero=True,
sequence_a_is_doc=False):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
# cnt_pos, cnt_neg = 0, 0
# max_N, max_M = 1024, 1024
# f = np.zeros((max_N, max_M), dtype=np.float32)
features = []
for (example_index, example) in enumerate(tqdm(examples)):
# if example_index % 100 == 0:
# logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
new_scores = {}
for qid, s in scores.items():
pred_na = na_probs[qid] > na_prob_thresh
if pred_na:
new_scores[qid] = float(not qid_to_has_ans[qid])
else:
new_scores[qid] = s
return new_scores
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
if not qid_list:
total = len(exact_scores)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores.values()) / total),
('f1', 100.0 * sum(f1_scores.values()) / total),
('total', total),
])
else:
total = len(qid_list)
return collections.OrderedDict([
('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
('total', total),
])
def merge_eval(main_eval, new_eval, prefix):
for k in new_eval:
main_eval['%s_%s' % (prefix, k)] = new_eval[k]
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for i, qid in enumerate(qid_list):
if qid not in scores:
continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
assert max_tokens_for_doc > 0
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
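# Added illustration: with e.g. 300 sub-tokens, max_tokens_for_doc = 200 and doc_stride = 128,
# the loop below produces spans (start=0, length=200) and (start=128, length=172), so tokens
# 128-199 appear in both spans and are later disambiguated by _check_is_max_context.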
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
# p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens that can be in an answer)
# Original TF implementation also keeps the classification token (set to 0) (not sure why...)
p_mask = []
# CLS token at the beginning
if not cls_token_at_end:
tokens.append(cls_token)
segment_ids.append(cls_token_segment_id)
p_mask.append(0)
cls_index = 0
# XLNet: P SEP Q SEP CLS
# Others: CLS Q SEP P SEP
if not sequence_a_is_doc:
# Query
tokens += query_tokens
segment_ids += [sequence_a_segment_id] * len(query_tokens)
p_mask += [1] * len(query_tokens)
# SEP token
tokens.append(sep_token)
segment_ids.append(sequence_a_segment_id)
p_mask.append(1)
# Paragraph
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = _check_is_max_context(doc_spans, doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
if not sequence_a_is_doc:
segment_ids.append(sequence_b_segment_id)
else:
segment_ids.append(sequence_a_segment_id)
p_mask.append(0)
paragraph_len = doc_span.length
if sequence_a_is_doc:
# SEP token
tokens.append(sep_token)
segment_ids.append(sequence_a_segment_id)
p_mask.append(1)
tokens += query_tokens
segment_ids += [sequence_b_segment_id] * len(query_tokens)
p_mask += [1] * len(query_tokens)
# SEP token
tokens.append(sep_token)
segment_ids.append(sequence_b_segment_id)
p_mask.append(1)
# CLS token at the end
if cls_token_at_end:
tokens.append(cls_token)
segment_ids.append(cls_token_segment_id)
p_mask.append(0)
cls_index = len(tokens) - 1 # Index of classification token
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(pad_token)
input_mask.append(0 if mask_padding_with_zero else 1)
segment_ids.append(pad_token_segment_id)
p_mask.append(1)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
span_is_impossible = example.is_impossible
start_position = None
end_position = None
if is_training and not span_is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
span_is_impossible = True
else:
if sequence_a_is_doc:
doc_offset = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and span_is_impossible:
start_position = cls_index
end_position = cls_index
if example_index < 20:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (unique_id))
logger.info("example_index: %s" % (example_index))
logger.info("doc_span_index: %s" % (doc_span_index))
logger.info("tokens: %s" % " ".join(tokens))
logger.info("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
logger.info("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info(
"input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
if is_training and span_is_impossible:
logger.info("impossible example")
if is_training and not span_is_impossible:
answer_text = " ".join(tokens[start_position:(end_position + 1)])
logger.info("start_position: %d" % (start_position))
logger.info("end_position: %d" % (end_position))
logger.info(
"answer: %s" % (answer_text))
features.append(
InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
cls_index=cls_index,
p_mask=p_mask,
paragraph_len=paragraph_len,
start_position=start_position,
end_position=end_position,
is_impossible=span_is_impossible))
unique_id += 1
return features
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
has_ans_score, has_ans_cnt = 0, 0
for qid in qid_list:
if not qid_to_has_ans[qid]:
continue
has_ans_cnt += 1
if qid not in scores:
continue
has_ans_score += scores[qid]
return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(
preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(
preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
main_eval['has_ans_exact'] = has_ans_exact
main_eval['has_ans_f1'] = has_ans_f1
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
cur_score = num_no_ans
best_score = cur_score
best_thresh = 0.0
qid_list = sorted(na_probs, key=lambda k: na_probs[k])
for _, qid in enumerate(qid_list):
if qid not in scores:
continue
if qid_to_has_ans[qid]:
diff = scores[qid]
else:
if preds[qid]:
diff = -1
else:
diff = 0
cur_score += diff
if cur_score > best_score:
best_score = cur_score
best_thresh = na_probs[qid]
return 100.0 * best_score / len(scores), best_thresh
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
main_eval['best_exact'] = best_exact
main_eval['best_exact_thresh'] = exact_thresh
main_eval['best_f1'] = best_f1
main_eval['best_f1_thresh'] = f1_thresh
def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]
if no_answer_probs is None:
no_answer_probs = {k: 0.0 for k in preds}
exact, f1 = get_raw_scores(examples, preds)
exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)
evaluation = make_eval_dict(exact_threshold, f1_threshold)
if has_answer_qids:
has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
merge_eval(evaluation, has_ans_eval, 'HasAns')
if no_answer_qids:
no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
merge_eval(evaluation, no_ans_eval, 'NoAns')
if no_answer_probs:
find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)
return evaluation
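# Usage sketch for squad_evaluate above (added, hedged): with `examples` loaded via the
# SquadV2Processor newly exported by this commit and `preds` mapping each qas_id to the predicted
# answer text, overall metrics can be obtained as
#   results = squad_evaluate(examples, preds)
# optionally passing `no_answer_probs` to also compute the best-threshold metrics.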
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# However, this is not always possible. Consider the following:
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
# We don't want to return `orig_text` because it contains the extra "'s".
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
# What we really want to return is "Steve Smith".
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(
"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
return cur_span_index == best_span_index
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
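# Added worked example: for scores [0.0, 1.0, 2.0] the returned probabilities are approximately
# [0.090, 0.245, 0.665] (softmax computed with the max score subtracted for numerical stability).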
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file, verbose_logging,
version_2_with_negative, null_score_diff_threshold):
def compute_predictions_logits(
all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
do_lower_case,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
verbose_logging,
version_2_with_negative,
null_score_diff_threshold
):
"""Write final predictions to the json file and log-odds of null if needed."""
logger.info("Writing predictions to: %s" % (output_prediction_file))
logger.info("Writing nbest to: %s" % (output_nbest_file))
......@@ -627,12 +507,12 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have only a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if len(nbest)==1:
if len(nbest) == 1:
nbest.insert(0,
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
......@@ -689,18 +569,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
return all_predictions
# For XLNet (and XLM which uses the same head)
RawResultExtended = collections.namedtuple("RawResultExtended",
["unique_id", "start_top_log_probs", "start_top_index",
"end_top_log_probs", "end_top_index", "cls_logits"])
def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
max_answer_length, output_prediction_file,
output_nbest_file,
output_null_log_odds_file, orig_data_file,
start_n_top, end_n_top, version_2_with_negative,
tokenizer, verbose_logging):
def compute_predictions_log_probs(
all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
output_prediction_file,
output_nbest_file,
output_null_log_odds_file,
start_n_top,
end_n_top,
version_2_with_negative,
tokenizer,
verbose_logging
):
""" XLNet write prediction logic (more complex than Bert's).
Write final predictions to the json file and log-odds of null if needed.
......@@ -709,7 +592,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction",
["feature_index", "start_index", "end_index",
"start_log_prob", "end_log_prob"])
"start_log_prob", "end_log_prob"])
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
......@@ -746,12 +629,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
for i in range(start_n_top):
for j in range(end_n_top):
start_log_prob = result.start_top_log_probs[i]
start_log_prob = result.start_logits[i]
start_index = result.start_top_index[i]
j_index = i * end_n_top + j
end_log_prob = result.end_top_log_probs[j_index]
end_log_prob = result.end_logits[j_index]
end_index = result.end_top_index[j_index]
# We could hypothetically create invalid predictions, e.g., predict
......@@ -792,7 +675,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
# XLNet un-tokenizer
# Let's keep it simple for now and see if we need all this later.
#
#
# tok_start_to_orig_index = feature.tok_start_to_orig_index
# tok_end_to_orig_index = feature.tok_end_to_orig_index
# start_orig_pos = tok_start_to_orig_index[pred.start_index]
......@@ -812,7 +695,12 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
if hasattr(tokenizer, "do_lower_case"):
do_lower_case = tokenizer.do_lower_case
else:
do_lower_case = tokenizer.do_lowercase_and_remove_accent
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose_logging)
if final_text in seen_predictions:
......@@ -872,146 +760,4 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
with open(orig_data_file, "r", encoding='utf-8') as reader:
orig_data = json.load(reader)["data"]
qid_to_has_ans = make_qid_to_has_ans(orig_data)
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
out_eval = {}
find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
return out_eval
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose_logging:
logger.info(
"Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in tok_ns_to_s_map.items():
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose_logging:
logger.info("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose_logging:
logger.info("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
return all_predictions
from .utils import InputExample, InputFeatures, DataProcessor
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
\ No newline at end of file
from tqdm import tqdm
import collections
import logging
import os
import json
import numpy as np
from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
from .utils import DataProcessor, InputExample, InputFeatures
from ...file_utils import is_tf_available, is_torch_available
if is_torch_available():
import torch
from torch.utils.data import TensorDataset
if is_tf_available():
import tensorflow as tf
logger = logging.getLogger(__name__)
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
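# Illustrative doctest-style sketch with a toy whitespace tokenizer standing in
# for a real sub-word tokenizer (all values hypothetical): the initial span
# covers the whole parenthesized expression, and the helper narrows it to the
# sub-tokens that exactly match the annotated answer.
#
#   >>> class _ToyTokenizer:
#   ...     def tokenize(self, text):
#   ...         return text.split()
#   >>> _improve_answer_span(["(", "1895", "-", "1943", ")"], 0, 4, _ToyTokenizer(), "1895")
#   (1, 1)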
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def _new_check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# if len(doc_spans) == 1:
# return True
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span["start"] + doc_span["length"] - 1
if position < doc_span["start"]:
continue
if position > end:
continue
num_left_context = position - doc_span["start"]
num_right_context = end - position
score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
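# Illustrative doctest-style sketch (hypothetical spans): with a stride that
# makes two spans overlap, token position 5 sits closer to the middle of the
# first span than of the second, so the first span gives it the most context.
#
#   >>> doc_spans = [{"start": 0, "length": 8}, {"start": 4, "length": 8}]
#   >>> _new_check_is_max_context(doc_spans, cur_span_index=0, position=5)
#   True
#   >>> _new_check_is_max_context(doc_spans, cur_span_index=1, position=5)
#   False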
def _is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length, is_training,
return_dataset=False):
"""
Converts a list of examples into a list of features that can be directly given as input to a model.
It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
examples: list of :class:`~transformers.data.processors.squad.SquadExample`
tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
max_seq_length: The maximum sequence length of the inputs.
doc_stride: The stride used when the context is too large and is split across several features.
max_query_length: The maximum length of the query.
is_training: whether to create features for model evaluation or model training.
return_dataset: Default False. Either 'pt' or 'tf'.
if 'pt': returns a torch.utils.data.TensorDataset,
if 'tf': returns a tf.data.Dataset
Returns:
list of :class:`~transformers.data.processors.squad.SquadFeatures`
Example::
processor = SquadV2Processor()
examples = processor.get_dev_examples(data_dir)
features = squad_convert_examples_to_features(
examples=examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=not evaluate,
)
"""
# Defining helper methods
unique_id = 1000000000
features = []
for (example_index, example) in enumerate(tqdm(examples)):
if is_training and not example.is_impossible:
# Get start and end position
start_position = example.start_position
end_position = example.end_position
# If the answer cannot be found in the text, then skip this example.
actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
continue
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
)
spans = []
truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
span_doc_tokens = all_doc_tokens
while len(spans) * doc_stride < len(all_doc_tokens):
encoded_dict = tokenizer.encode_plus(
truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
max_length=max_seq_length,
return_overflowing_tokens=True,
pad_to_max_length=True,
stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
)
paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
if tokenizer.pad_token_id in encoded_dict['input_ids']:
non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
else:
non_padded_ids = encoded_dict['input_ids']
tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
token_to_orig_map = {}
for i in range(paragraph_len):
index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
encoded_dict["paragraph_len"] = paragraph_len
encoded_dict["tokens"] = tokens
encoded_dict["token_to_orig_map"] = token_to_orig_map
encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
encoded_dict["token_is_max_context"] = {}
encoded_dict["start"] = len(spans) * doc_stride
encoded_dict["length"] = paragraph_len
spans.append(encoded_dict)
if "overflowing_tokens" not in encoded_dict:
break
span_doc_tokens = encoded_dict["overflowing_tokens"]
for doc_span_index in range(len(spans)):
for j in range(spans[doc_span_index]["paragraph_len"]):
is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
spans[doc_span_index]["token_is_max_context"][index] = is_max_context
for span in spans:
# Identify the position of the CLS token
cls_index = span['input_ids'].index(tokenizer.cls_token_id)
        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
        # Original TF implementation also keeps the classification token (set to 0) (not sure why...)
p_mask = np.array(span['token_type_ids'])
p_mask = np.minimum(p_mask, 1)
if tokenizer.padding_side == "right":
# Limit positive values to one
p_mask = 1 - p_mask
p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1
# Set the CLS index to '0'
p_mask[cls_index] = 0
span_is_impossible = example.is_impossible
start_position = 0
end_position = 0
if is_training and not span_is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = span["start"]
doc_end = span["start"] + span["length"] - 1
out_of_span = False
if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = cls_index
end_position = cls_index
span_is_impossible = True
else:
if tokenizer.padding_side == "left":
doc_offset = 0
else:
doc_offset = len(truncated_query) + sequence_added_tokens
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
features.append(SquadFeatures(
span['input_ids'],
span['attention_mask'],
span['token_type_ids'],
cls_index,
p_mask.tolist(),
example_index=example_index,
unique_id=unique_id,
paragraph_len=span['paragraph_len'],
token_is_max_context=span["token_is_max_context"],
tokens=span["tokens"],
token_to_orig_map=span["token_to_orig_map"],
start_position=start_position,
end_position=end_position
))
unique_id += 1
if return_dataset == 'pt':
if not is_torch_available():
raise ImportError("Pytorch must be installed to return a pytorch dataset.")
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
if not is_training:
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_example_index, all_cls_index, all_p_mask)
else:
all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
all_start_positions, all_end_positions,
all_cls_index, all_p_mask)
return features, dataset
return features
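# Illustrative end-to-end sketch (paths and hyper-parameters are hypothetical):
# with return_dataset="pt" the features are also packed into a TensorDataset
# that can be fed straight to a DataLoader for evaluation.
#
#   >>> from torch.utils.data import DataLoader
#   >>> from transformers import BertTokenizer
#   >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#   >>> examples = SquadV2Processor().get_dev_examples("./squad_data")
#   >>> features, dataset = squad_convert_examples_to_features(
#   ...     examples, tokenizer, max_seq_length=384, doc_stride=128,
#   ...     max_query_length=64, is_training=False, return_dataset="pt")
#   >>> eval_dataloader = DataLoader(dataset, batch_size=8)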
class SquadProcessor(DataProcessor):
"""
Processor for the SQuAD data set.
Overridden by SquadV1Processor and SquadV2Processor, used by version 1.1 and version 2.0 of SQuAD, respectively.
"""
train_file = None
dev_file = None
def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
if not evaluate:
answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
answer_start = tensor_dict['answers']['answer_start'][0].numpy()
answers = []
else:
answers = [{
"answer_start": start.numpy(),
"text": text.numpy().decode('utf-8')
} for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]
answer = None
answer_start = None
return SquadExample(
qas_id=tensor_dict['id'].numpy().decode("utf-8"),
question_text=tensor_dict['question'].numpy().decode('utf-8'),
context_text=tensor_dict['context'].numpy().decode('utf-8'),
answer_text=answer,
start_position_character=answer_start,
title=tensor_dict['title'].numpy().decode('utf-8'),
answers=answers
)
def get_examples_from_dataset(self, dataset, evaluate=False):
"""
Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.
Args:
dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
evaluate: boolean specifying if in evaluation mode or in training mode
Returns:
List of SquadExample
Examples::
import tensorflow_datasets as tfds
dataset = tfds.load("squad")
training_examples = get_examples_from_dataset(dataset, evaluate=False)
evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
"""
if evaluate:
dataset = dataset["validation"]
else:
dataset = dataset["train"]
examples = []
for tensor_dict in tqdm(dataset):
examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
return examples
def get_train_examples(self, data_dir, filename=None):
"""
Returns the training examples from the data directory.
Args:
data_dir: Directory containing the data files used for training and evaluating.
filename: None by default, specify this if the training file has a different name than the original one
which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
"""
if self.train_file is None:
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
return self._create_examples(input_data, "train")
def get_dev_examples(self, data_dir, filename=None):
"""
Returns the evaluation examples from the data directory.
Args:
data_dir: Directory containing the data files used for training and evaluating.
filename: None by default, specify this if the evaluation file has a different name than the original one
which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
"""
if self.dev_file is None:
raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
input_data = json.load(reader)["data"]
return self._create_examples(input_data, "dev")
def _create_examples(self, input_data, set_type):
is_training = set_type == "train"
examples = []
for entry in tqdm(input_data):
title = entry['title']
for paragraph in entry["paragraphs"]:
context_text = paragraph["context"]
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position_character = None
answer_text = None
answers = []
if "is_impossible" in qa:
is_impossible = qa["is_impossible"]
else:
is_impossible = False
if not is_impossible:
if is_training:
answer = qa["answers"][0]
answer_text = answer['text']
start_position_character = answer['answer_start']
else:
answers = qa["answers"]
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
context_text=context_text,
answer_text=answer_text,
start_position_character=start_position_character,
title=title,
is_impossible=is_impossible,
answers=answers
)
examples.append(example)
return examples
class SquadV1Processor(SquadProcessor):
train_file = "train-v1.1.json"
dev_file = "dev-v1.1.json"
class SquadV2Processor(SquadProcessor):
train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"
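# Illustrative doctest-style sketch (paths are hypothetical): each subclass only
# fixes the default file names; a non-standard file can still be loaded through
# the `filename` argument.
#
#   >>> processor = SquadV1Processor()                      # expects train-v1.1.json / dev-v1.1.json
#   >>> train_examples = processor.get_train_examples("./SQUAD")
#   >>> custom_examples = processor.get_train_examples("./SQUAD", filename="my-train.json")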
class SquadExample(object):
"""
A single training/test example for the Squad dataset, as loaded from disk.
Args:
qas_id: The example's unique identifier
question_text: The question string
context_text: The context string
answer_text: The answer string
start_position_character: The character position of the start of the answer
title: The title of the example
answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
is_impossible: False by default, set to True if the example has no possible answer.
"""
def __init__(self,
qas_id,
question_text,
context_text,
answer_text,
start_position_character,
title,
answers=[],
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.context_text = context_text
self.answer_text = answer_text
self.title = title
self.is_impossible = is_impossible
self.answers = answers
self.start_position, self.end_position = 0, 0
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
# Split on whitespace so that different tokens may be attributed to their original position.
for c in self.context_text:
if _is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
self.doc_tokens = doc_tokens
self.char_to_word_offset = char_to_word_offset
# Start and end positions are only computed when a start character position is provided (i.e. for training examples).
if start_position_character is not None and not is_impossible:
self.start_position = char_to_word_offset[start_position_character]
self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
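# Illustrative doctest-style sketch (hypothetical example): the constructor
# pre-computes the whitespace tokenization of the context and maps the answer's
# character offset to word-level start/end positions.
#
#   >>> ex = SquadExample(
#   ...     qas_id="dummy-0", question_text="Where does Alice live?",
#   ...     context_text="Alice lives in Paris.", answer_text="Paris",
#   ...     start_position_character=15, title="Alice")
#   >>> ex.doc_tokens
#   ['Alice', 'lives', 'in', 'Paris.']
#   >>> (ex.start_position, ex.end_position)
#   (3, 3)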
class SquadFeatures(object):
"""
Single squad example features to be fed to a model.
Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample`
using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
cls_index: the index of the CLS token.
p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
Mask with 1 for tokens that cannot be in the answer and 0 for tokens that can be in an answer
example_index: the index of the example
unique_id: The unique Feature identifier
paragraph_len: The length of the context
token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object.
If a token does not have its maximum context in this feature object, it means that another feature object
has more information related to that token and should be prioritized over this feature for that token.
tokens: list of tokens corresponding to the input ids
token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
start_position: start of the answer token index
end_position: end of the answer token index
"""
def __init__(self,
input_ids,
attention_mask,
token_type_ids,
cls_index,
p_mask,
example_index,
unique_id,
paragraph_len,
token_is_max_context,
tokens,
token_to_orig_map,
start_position,
end_position
):
self.input_ids = input_ids
self.attention_mask = attention_mask
self.token_type_ids = token_type_ids
self.cls_index = cls_index
self.p_mask = p_mask
self.example_index = example_index
self.unique_id = unique_id
self.paragraph_len = paragraph_len
self.token_is_max_context = token_is_max_context
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.start_position = start_position
self.end_position = end_position
class SquadResult(object):
"""
Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
Args:
unique_id: The unique identifier corresponding to that example.
start_logits: The logits corresponding to the start of the answer
end_logits: The logits corresponding to the end of the answer
"""
def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
self.start_logits = start_logits
self.end_logits = end_logits
self.unique_id = unique_id
if start_top_index:
self.start_top_index = start_top_index
self.end_top_index = end_top_index
self.cls_logits = cls_logits
\ No newline at end of file
......@@ -21,7 +21,7 @@ import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
import requests
from tqdm import tqdm
from tqdm.auto import tqdm
from contextlib import contextmanager
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
......@@ -73,6 +73,8 @@ TF2_WEIGHTS_NAME = 'tf_model.h5'
TF_WEIGHTS_NAME = 'model.ckpt'
CONFIG_NAME = "config.json"
S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
def is_torch_available():
return _torch_available
......@@ -103,6 +105,18 @@ else:
return fn
return docstring_decorator
def is_remote_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ('http', 'https', 's3')
def hf_bucket_url(identifier, postfix=None):
if postfix is None:
return "/".join((S3_BUCKET_PREFIX, identifier))
else:
return "/".join((S3_BUCKET_PREFIX, identifier, postfix))
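# Illustrative doctest-style sketch: these helpers are used when resolving model
# identifiers (the values below only show the string handling, not a guarantee
# that the objects exist on S3).
#
#   >>> is_remote_url("https://example.com/model.bin")
#   True
#   >>> is_remote_url("./local/model.bin")
#   False
#   >>> hf_bucket_url("bert-base-uncased", postfix="config.json")
#   'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased/config.json'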
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
......@@ -171,9 +185,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
if is_remote_url(url_or_filename):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir=cache_dir,
force_download=force_download, proxies=proxies,
......@@ -181,7 +193,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
elif urlparse(url_or_filename).scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
......@@ -245,7 +257,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0):
return
content_length = response.headers.get('Content-Length')
total = resume_size + int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total, initial=resume_size)
progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading")
for chunk in response.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
......
......@@ -16,10 +16,11 @@ from __future__ import absolute_import, division, print_function
import os
from os.path import expanduser
import six
import requests
import six
from requests.exceptions import HTTPError
from tqdm import tqdm
ENDPOINT = "https://huggingface.co"
......@@ -129,10 +130,13 @@ class HfApi:
# Even though we presign with the correct content-type,
# the client still has to specify it when uploading the file.
with open(filepath, "rb") as f:
pf = TqdmProgressFileReader(f)
r = requests.put(urls.write, data=f, headers={
"content-type": urls.type,
})
r.raise_for_status()
pf.close()
return urls.access
def list_objs(self, token):
......@@ -148,6 +152,34 @@ class HfApi:
class TqdmProgressFileReader:
"""
Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`)
and override `f.read()` so as to display a tqdm progress bar.
see github.com/huggingface/transformers/pull/2078#discussion_r354739608
for implementation details.
"""
def __init__(
self,
f # type: io.BufferedReader
):
self.f = f
self.total_size = os.fstat(f.fileno()).st_size # type: int
self.pbar = tqdm(total=self.total_size, leave=False)
if six.PY3:
# does not work unless PY3
# no big deal as the CLI does not currently support PY2 anyways.
self.read = f.read
f.read = self._read
def _read(self, n=-1):
self.pbar.update(n)
return self.read(n)
def close(self):
self.pbar.close()
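# Illustrative sketch (file path and presigned URL are hypothetical): wrapping
# the file object makes every chunk read by `requests` advance a tqdm bar, which
# is how the upload above reports progress.
#
#   >>> with open("/tmp/weights.bin", "rb") as f:
#   ...     pf = TqdmProgressFileReader(f)
#   ...     requests.put(presigned_url, data=f)
#   ...     pf.close()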
class HfFolder:
......
......@@ -28,7 +28,6 @@ from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassifica
from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
from .modeling_utils import PreTrainedModel, SequenceSummary
......
# coding=utf-8
# Copyright (c) 2019 Yang Liu
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
A general wrapper around models with LM heads to generate sequences
using beam search.
"""
import torch
from torch import nn
class TransformerBeamSearch(nn.Module):
def __init__(
self,
model,
tokenizer,
batch_size,
beam_size,
min_length,
max_length,
alpha=0,
block_repeating_trigram=True,
):
"""
Attributes:
mask_word_id: token id that corresponds to the mask
"""
super(TransformerBeamSearch, self).__init__()
self.model = model
self.tokenizer = tokenizer
self.start_token_id = tokenizer.start_token_id
self.end_token_id = tokenizer.end_token_id
self.pad_token_id = tokenizer.pad_token_id
        self.batch_size = batch_size
        self.beam_size = beam_size
self.min_length = min_length
self.max_length = max_length
self.block_repeating_trigram = block_repeating_trigram
self.apply_length_penalty = False if alpha == 0 else True
self.alpha = alpha
# State of the beam
self.hypotheses = [[] for _ in range(batch_size)]
self.batch_offset = torch.arange(batch_size, dtype=torch.long)
self.beam_offset = torch.arange(
0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
)
self.growing_beam = torch.full(
(batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
)
self.topk_log_probabilities = torch.tensor(
[0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
).repeat(batch_size)
        self.results = {
            "predictions": [[] for _ in range(batch_size)],
            "scores": [[] for _ in range(batch_size)],
        }
self._step = 0
self.is_done = False
def step(self, log_probabilities):
""" Grows the beam by one step. """
self._step += 1
# The batch size changes as some beams finish so we define _B
vocab_size = log_probabilities.size(-1)
_B = log_probabilities.size(0) // self.beam_size
# Multiply each beam probability with the probability of the
# next token (conditioned on the words in the beam).
log_probabilities += self.topk_log_probabilities.view(-1, 1)
self.enforce_min_length(log_probabilities)
if self.block_repeating_trigram:
self.remove_repeating_trigrams(log_probabilities, _B)
# Find the `beam_size` (previous_beam + token) combinations with
# the highest score
        topk_log_probabilities, topk_ids = log_probabilities.view(
            _B, self.beam_size * vocab_size
        ).topk(self.beam_size, dim=1)
# Apply the length penalty. The +1 accounts for the [EOS] token
# that will be added if the beam ends.
topk_scores = topk_log_probabilities / self.length_penalty()
# Retrieve the corresponding respective beam and token id
# topk_token_ids[i] will be added to topk_beam_ids[i]
topk_beam_ids = topk_ids.div(vocab_size)
topk_token_ids = topk_ids.fmod(vocab_size)
# Retrieve the row index of the surviving beams in the original
# view of the log_probabilities tensor
surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1)).view(
-1
)
# Append the last predictions
self.growing_beam = torch.cat(
[
self.growing_beam.index_select(0, surviving_beams_rows),
topk_token_ids.view(-1, 1),
],
1,
)
# Check if any of the beam searches has ended during this
# growth step. Also if top beam (most probable) has ended
# for one element of the batch.
is_finished = topk_token_ids.eq(self.end_token_id)
        self.enforce_max_length(is_finished)
is_top_beam_finished = is_finished[:, 0].eq(1)
# Save the finished searches
if is_finished.any():
predictions = self.growing_beam.view(
-1, self.beam_size, self.growing_beam.size(1)
)
for i in range(is_finished.size(0)):
if is_top_beam_finished[i]:
is_finished[i].fill_(1)
finished_hyp = is_finished[i].nonzero().view(-1)
# Store finished hypotheses for this batch.
b = self.batch_offset[i]
for j in finished_hyp:
self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
# If the batch reached the end, save the best hypotheses
# in terms of length-penalized score.
if is_top_beam_finished[i]:
best_hyp = sorted(
self.hypotheses[b], key=lambda x: x[0], reverse=True
)
best_score, best_prediction = best_hyp[0]
self.results["scores"][b].append(best_score)
self.results["predictions"][b].append(best_prediction)
non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
if len(non_finished) == 0:
self.is_done = True
# Remove finished batches for the next step.
topk_log_probabilities = topk_log_probabilities.index_select(
0, non_finished
)
self.batch_offset = self.batch_offset.index_select(0, non_finished)
self.growing_beam = predictions.index_select(0, non_finished).view(
-1, self.growing_beam.size(-1)
)
surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished)
return surviving_beams_rows
def forward(self, encoder_input_ids, **kwargs):
# keyword arguments come in 3 flavors: encoder-specific (prefixed by
# `encoder_`), decoder-specific (prefixed by `decoder_`) and those
        # that apply to the model as a whole.
# We let the specific kwargs override the common ones in case of conflict.
kwargs_encoder = {
argument[len("encoder_"):]: value
for argument, value in kwargs.items()
if argument.startswith("encoder_")
}
kwargs_decoder = {
argument[len("decoder_"):]: value
for argument, value in kwargs.items()
if argument.startswith("decoder_")
}
kwargs_common = {
argument: value
for argument, value in kwargs.items()
if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
}
kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
# forward pass on the encoder
        encoder_outputs = self.model.encoder(encoder_input_ids, **kwargs_encoder)
kwargs_decoder["encoder_hidden_states"] = tile(
encoder_outputs, self.beam_size, dim=0
)
# grow the beam by generating sequences in an autoregressive way
self.growing_beam = torch.full(
(self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
)
for step in range(self.max_length):
decoder_input = self.growing_beam[:, -1]
            outputs = self.model.decoder(decoder_input, **kwargs_decoder)
            log_probabilities = torch.nn.functional.log_softmax(outputs[1], dim=-1)
surviving_beams_rows = self.step(log_probabilities)
if self.is_done:
break
kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
"encoder_hidden_states"
].index_select(0, surviving_beams_rows)
return self.results
    def remove_repeating_trigrams(self, log_probabilities, _B):
        if self._step + 1 > 3:
            for i in range(_B * self.beam_size):
                tokens = [t for t in self.growing_beam[i]]
                trigrams = [(tokens[j - 1], tokens[j], tokens[j + 1]) for j in range(1, len(tokens) - 1)]
                last_trigram = tuple(trigrams[-1])
                if last_trigram in trigrams[:-1]:
                    log_probabilities[i] = -1e20
    def enforce_min_length(self, log_probabilities):
        if self._step < self.min_length:
            log_probabilities[:, self.end_token_id] = -1e20
    def enforce_max_length(self, is_finished):
        if self._step + 1 == self.max_length:
            is_finished.fill_(1)
def length_penalty(self):
return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha
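    # Illustrative values (not from the original code): with a hypothetical
    # alpha of 0.6, this GNMT-style penalty ((5 + _step + 1) / 6) ** alpha grows
    # slowly with the step count, from roughly 1.1 at the first step to roughly
    # 2.35 at step 19, so longer hypotheses are only mildly discounted.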
def tile(x, count, dim=0):
"""
Tiles `x` along dimension `dim` `count` times.
Example:
        >> ex = torch.tensor([[1, 2], [3, 4]])
>> tile(ex, 2, 0)
torch.Tensor([[1,2],[1,2],[3,4],[3,4]])
"""
perm = list(range(len(x.size())))
if dim != 0:
perm[0], perm[dim] = perm[dim], perm[0]
x = x.permute(perm).contiguous()
out_size = list(x.size())
out_size[0] *= count
batch = x.size(0)
x = (
x.view(batch, -1)
.transpose(0, 1)
.repeat(count, 1)
.transpose(0, 1)
.contiguous()
.view(*out_size)
)
if dim != 0:
x = x.permute(perm).contiguous()
return x
......@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
}
......@@ -667,18 +671,20 @@ class BertModel(BertPreTrainedModel):
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
extended_attention_mask = attention_mask[:, None, :, :]
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if attention_mask.dim() == 2:
elif attention_mask.dim() == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
# - if the model is a decoder, apply a causal mask in addition to the padding mask
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder:
batch_size, seq_length = input_shape
seq_ids = torch.arange(seq_length, device=device)
causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
causal_mask = causal_mask.to(torch.long) # not converting to long will cause errors with pytorch version < 1.3
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
else:
extended_attention_mask = attention_mask[:, None, None, :]
else:
raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
......@@ -690,14 +696,19 @@ class BertModel(BertPreTrainedModel):
        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
if self.config.is_decoder:
if self.config.is_decoder and encoder_hidden_states is not None:
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
if encoder_attention_mask is None:
encoder_attention_mask = torch.ones(input_shape, device=device)
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
if encoder_attention_mask.dim() == 3:
encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
if encoder_attention_mask.dim() == 2:
elif encoder_attention_mask.dim() == 2:
encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
else:
raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
encoder_attention_mask.shape))
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
......@@ -1226,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
input_ids = tokenizer.encode(input_text)
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
# a nice puppet
......