Commit e4d852a0 authored by Jon Tow

Clean up

parent f4f7618a
@@ -15,8 +15,7 @@ class DROP(Task):
     URL = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
 
     def download(self):
-        if self.DATAFOLDER.exists():
-            return
+        if self.DATAFOLDER.exists(): return
         Path.mkdir(self.DATAFOLDER)
         download_file(self.URL, to=str(self.DATAFOLDER / "drop_dataset.zip"))
         with ZipFile(self.DATAFOLDER / "drop_dataset.zip", "r") as zip:
@@ -39,6 +38,7 @@ class DROP(Task):
         for doc in docs:
             for qa in doc["qa_pairs"]:
                 yield {
+                    "id": qa["query_id"],
                     "passage": doc["passage"],
                     "question": qa["question"],
                     "answers": self.get_answers(qa["answer"]),
......@@ -48,7 +48,7 @@ class DROP(Task):
def get_answers(cls, answers):
# NOTE: We wrap every non-`list` answer into a list for uniformity.
if answers["number"] != "":
return [answers["number"]]
return [str(answers["number"])]
if answers["spans"] != []:
return answers["spans"]
return [" ".join([answers["date"]["day"],
......@@ -85,7 +85,7 @@ class DROP(Task):
"""
conts = []
for _ in doc["answers"]:
conts.append(rf.greedy_until(ctx, ["\n", "."]))
conts.append(rf.greedy_until(ctx, ["."]))
return conts
def process_results(self, doc, results):
@@ -98,12 +98,13 @@ class DROP(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
-        gold, pred = doc["answers"], results
-        print(gold)
-        print(pred)
-        exact_match = self._exact_match(gold, pred)
-        f1_score = self._f1_score(gold, pred)
-        return {"em": exact_match, "f1": f1_score}
+        golds, preds = doc["answers"], results
+        exact_match = self._exact_match(golds, preds)
+        f1_score = self._f1_score(golds, preds)
+        return {
+            "em": exact_match,
+            "f1": f1_score
+        }
 
     def _exact_match(self, golds, preds):
         """ Returns the exact match of normalized gold answers and predictions. """
@@ -112,13 +113,9 @@ class DROP(Task):
         return int(normalized_golds == normalized_preds)
 
     def _f1_score(self, golds, preds):
-        """Returns the average F1-score over normalized `gold` and `pred`
-        answer lists.
-        """
+        """Returns the average F1-score over normalized gold answers and predictions."""
         gold_bags = self._answer_to_bags(golds)
-        print("GOLD BAGS: " + str(gold_bags))
         pred_bags = self._answer_to_bags(preds)
-        print("PRED BAGS: " + str(pred_bags))
         f1_per_bag = self._align_bags(gold_bags, pred_bags)
         return np.mean(f1_per_bag)
@@ -133,7 +130,6 @@ class DROP(Task):
-                print(self._is_number_match(gold_bag, pred_bag))
                 if self._is_number_match(gold_bag, pred_bag):
                     scores[gold_index, pred_index] = self._bag_f1(gold_bag, pred_bag)
-        print(scores)
         row_ind, col_ind = linear_sum_assignment(-scores)
         max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
         for row, column in zip(row_ind, col_ind):
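The alignment above uses SciPy's Hungarian-algorithm solver to pick the one-to-one pairing of gold and prediction bags with the highest total F1. A minimal self-contained sketch with a made-up score matrix:

import numpy as np
from scipy.optimize import linear_sum_assignment

# scores[g, p] holds the bag-level F1 of gold bag g against prediction bag p.
scores = np.array([[1.0, 0.0],
                   [0.0, 0.5]])
# linear_sum_assignment minimizes cost, so negate the scores to maximize F1.
row_ind, col_ind = linear_sum_assignment(-scores)
print(row_ind, col_ind)                 # [0 1] [0 1]: gold 0 <-> pred 0, gold 1 <-> pred 1
print(scores[row_ind, col_ind].mean())  # 0.75, averaged into the final F1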
@@ -169,7 +165,10 @@ class DROP(Task):
         A dictionary where keys are the names of submetrics and values are
         functions that aggregate a list of metrics
         """
-        return {"em": mean, "f1": mean}
+        return {
+            "em": mean,
+            "f1": mean
+        }
 
     def higher_is_better(self):
         """
@@ -178,60 +177,7 @@ class DROP(Task):
         A dictionary where keys are the names of submetrics and values are
         whether a higher value of the submetric is better
         """
-        return {"em": True, "f1": True}
+        return {
+            "em": True,
+            "f1": True
+        }
-
-
-# Temporary sanity-checks
-def main():
-    drop = DROP()
-
-    def test_bags():
-        multiple_answers = ["Pacific Ocean", "Pacific"]
-        ma_bags = drop._answer_to_bags(multiple_answers)
-        print(f"Multiple Choice Answer Bags: {multiple_answers} => {ma_bags}")
-        assert len(ma_bags) == 2
-        number_answer = ["1974"]
-        number_bags = drop._answer_to_bags(number_answer)
-        print(f"Number Bags: {number_answer} => {number_bags}")
-        print()
-    test_bags()
-
-    def test_is_number_match():
-        gold = ["10 29 1999"]
-        pred = ["4 29 1990"]
-        gb = drop._answer_to_bags(gold)
-        pb = drop._answer_to_bags(pred)
-        print(gb)
-        print(pb)
-        for g in gb:
-            for p in pb:
-                match = drop._is_number_match(g, p)
-                print(match)
-        print()
-    #test_is_number_match()
-
-    def test_exact_match():
-        gold = ["Bob Ross"]
-        pred = ["Bob Ross"]
-        em = drop._exact_match(gold, pred)
-        print(em)
-    #test_exact_match()
-
-    def test_f1_score():
-        gold = ["25 to 44"]
-        pred = ["25 to 44 or 45 to 64"]
-        f1 = drop._f1_score(gold, pred)
-        print(gold)
-        print(pred)
-        print(f1)
-        gold = ["300", "1992"]
-        pred = ["300", "1992"]
-        f1 = drop._f1_score(gold, pred)
-        print(f1)
-    #test_f1_score()
-
-
-if __name__ == "__main__":
-    main()