Commit e4d852a0 authored by Jon Tow

Clean up

parent f4f7618a
@@ -15,8 +15,7 @@ class DROP(Task):
     URL = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"

     def download(self):
-        if self.DATAFOLDER.exists():
-            return
+        if self.DATAFOLDER.exists(): return
         Path.mkdir(self.DATAFOLDER)
         download_file(self.URL, to=str(self.DATAFOLDER / "drop_dataset.zip"))
         with ZipFile(self.DATAFOLDER / "drop_dataset.zip", "r") as zip:
@@ -39,6 +38,7 @@ class DROP(Task):
         for doc in docs:
             for qa in doc["qa_pairs"]:
                 yield {
+                    "id": qa["query_id"],
                     "passage": doc["passage"],
                     "question": qa["question"],
                     "answers": self.get_answers(qa["answer"]),
@@ -48,7 +48,7 @@ class DROP(Task):
     def get_answers(cls, answers):
         # NOTE: We wrap every non-`list` answer into a list for uniformity.
         if answers["number"] != "":
-            return [answers["number"]]
+            return [str(answers["number"])]
         if answers["spans"] != []:
             return answers["spans"]
         return [" ".join([answers["date"]["day"],
@@ -76,16 +76,16 @@ class DROP(Task):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param ctx: str
             The context string, generated by fewshot_context. This includes the natural
             language description, as well as the few shot examples, and the question
             part of the document for `doc`.
         """
         conts = []
         for _ in doc["answers"]:
-            conts.append(rf.greedy_until(ctx, ["\n", "."]))
+            conts.append(rf.greedy_until(ctx, ["."]))
         return conts

     def process_results(self, doc, results):
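The change above drops "\n" from the stop list, so generation for each answer now runs until the first period. rf.greedy_until is the harness's request factory; how the resulting continuation gets trimmed is sketched below as a generic, illustrative helper, not the harness's actual truncation code:

def truncate_at_stops(continuation, stops):
    # Cut the continuation at the earliest occurrence of any stop sequence.
    for stop in stops:
        idx = continuation.find(stop)
        if idx != -1:
            continuation = continuation[:idx]
    return continuation

raw = " the Pacific Ocean. It also mentions the Atlantic."
print(truncate_at_stops(raw, ["."]))        # ' the Pacific Ocean'
print(truncate_at_stops(raw, ["\n", "."]))  # same here; a newline before the period would cut earlier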
@@ -94,16 +94,17 @@ class DROP(Task):
         the metric for that one document

         :param doc:
             The document as returned from training_docs, validation_docs, or test_docs.
         :param results:
             The results of the requests created in construct_requests.
         """
-        gold, pred = doc["answers"], results
-        print(gold)
-        print(pred)
-        exact_match = self._exact_match(gold, pred)
-        f1_score = self._f1_score(gold, pred)
-        return {"em": exact_match, "f1": f1_score}
+        golds, preds = doc["answers"], results
+        exact_match = self._exact_match(golds, preds)
+        f1_score = self._f1_score(golds, preds)
+        return {
+            "em": exact_match,
+            "f1": f1_score
+        }

     def _exact_match(self, golds, preds):
         """ Returns the exact match of normalized gold answers and predictions. """
@@ -112,13 +113,9 @@ class DROP(Task):
         return int(normalized_golds == normalized_preds)

     def _f1_score(self, golds, preds):
-        """Returns the average F1-score over normalized `gold` and `pred`
-        answer lists.
-        """
+        """Returns the average F1-score over normalized gold answers and predictions. """
         gold_bags = self._answer_to_bags(golds)
-        print("GOLD BAGS: " + str(gold_bags))
         pred_bags = self._answer_to_bags(preds)
-        print("PRED BAGS: " + str(pred_bags))
         f1_per_bag = self._align_bags(gold_bags, pred_bags)
         return np.mean(f1_per_bag)
@@ -133,7 +130,6 @@ class DROP(Task):
                 print(self._is_number_match(gold_bag, pred_bag))
                 if self._is_number_match(gold_bag, pred_bag):
                     scores[gold_index, pred_index] = self._bag_f1(gold_bag, pred_bag)
-        print(scores)
         row_ind, col_ind = linear_sum_assignment(-scores)
         max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
         for row, column in zip(row_ind, col_ind):
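The alignment step above follows the DROP-style bag F1: every gold answer and prediction is turned into a bag of normalized tokens, each gold/pred pair is scored with token-overlap F1, and scipy.optimize.linear_sum_assignment picks the one-to-one pairing that maximizes total F1 (hence the negated score matrix). _answer_to_bags, _bag_f1, and the _is_number_match gate are not shown in full in this diff, so the sketch below re-derives the idea on plain token sets and omits the number-match check:

import numpy as np
from scipy.optimize import linear_sum_assignment

def bag_f1(gold_bag, pred_bag):
    # Token-overlap F1 between two bags (sets) of normalized tokens.
    intersection = len(gold_bag & pred_bag)
    if intersection == 0:
        return 0.0
    precision = intersection / len(pred_bag)
    recall = intersection / len(gold_bag)
    return 2 * precision * recall / (precision + recall)

def align_bags(gold_bags, pred_bags):
    # Score every gold/pred pair, then keep the assignment maximizing total F1.
    scores = np.zeros([len(gold_bags), len(pred_bags)])
    for g, gold_bag in enumerate(gold_bags):
        for p, pred_bag in enumerate(pred_bags):
            scores[g, p] = bag_f1(gold_bag, pred_bag)
    row_ind, col_ind = linear_sum_assignment(-scores)
    max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
    for row, col in zip(row_ind, col_ind):
        max_scores[row] = max(max_scores[row], scores[row, col])
    return max_scores

gold = [{"300"}, {"1992"}]
pred = [{"1992"}, {"300"}]
print(align_bags(gold, pred).mean())  # 1.0; the optimal pairing ignores answer order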
@@ -169,7 +165,10 @@ class DROP(Task):
         A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        return {"em": mean, "f1": mean}
+        return {
+            "em": mean,
+            "f1": mean
+        }

     def higher_is_better(self):
         """
@@ -178,60 +177,7 @@ class DROP(Task):
         A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        return {"em": True, "f1": True}
+        return {
+            "em": True,
+            "f1": True
+        }
-
-
-# Temporary sanity-checks
-def main():
-    drop = DROP()
-
-    def test_bags():
-        multiple_answers = ["Pacific Ocean", "Pacific"]
-        ma_bags = drop._answer_to_bags(multiple_answers)
-        print(f"Multiple Choice Answer Bags: {multiple_answers} => {ma_bags}")
-        assert len(ma_bags) == 2
-        number_answer = ["1974"]
-        number_bags = drop._answer_to_bags(number_answer)
-        print(f"Number Bags: {number_answer} => {number_bags}")
-        print()
-    test_bags()
-
-    def test_is_number_match():
-        gold = ["10 29 1999"]
-        pred = ["4 29 1990"]
-        gb = drop._answer_to_bags(gold)
-        pb = drop._answer_to_bags(pred)
-        print(gb)
-        print(pb)
-        for g in gb:
-            for p in pb:
-                match = drop._is_number_match(g, p)
-                print(match)
-        print()
-    #test_is_number_match()
-
-    def test_exact_match():
-        gold = ["Bob Ross"]
-        pred = ["Bob Ross"]
-        em = drop._exact_match(gold, pred)
-        print(em)
-    #test_exact_match()
-
-    def test_f1_score():
-        gold = ["25 to 44"]
-        pred = ["25 to 44 or 45 to 64"]
-        f1 = drop._f1_score(gold, pred)
-        print(gold)
-        print(pred)
-        print(f1)
-        gold = ["300", "1992"]
-        pred = ["300", "1992"]
-        f1 = drop._f1_score(gold, pred)
-        print(f1)
-    #test_f1_score()
-
-
-if __name__ == "__main__":
-    main()
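Taken together: process_results emits one {"em", "f1"} dict per document, aggregation reduces each key with mean, and higher_is_better marks both submetrics as higher-is-better. A toy reduction, using statistics.mean as a stand-in for the harness's own mean helper:

from statistics import mean  # stand-in for the harness's `mean`

per_doc = [{"em": 1, "f1": 1.0}, {"em": 0, "f1": 0.5}, {"em": 1, "f1": 1.0}]
aggregated = {key: mean(d[key] for d in per_doc) for key in ("em", "f1")}
print(aggregated)  # {'em': 0.666..., 'f1': 0.833...}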