Commit 79aa53b1 authored by lintangsutawika

running process for drop

parent f9558ce5
@@ -6,7 +6,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] Glue
- [x] SuperGlue
- [ ] CoQA (Lintang)
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
@@ -19,19 +19,25 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r
### Citation

```
@misc{dua2019drop,
    title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
    author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
    year={2019},
    eprint={1903.00161},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```
### Groups and Tasks

#### Groups

* Not part of a group yet.

#### Tasks

* `drop`
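For reference, below is a minimal sketch of running the task programmatically. It is not part of this commit; the `hf` model name, the `pretrained=` model argument, and the `simple_evaluate` entry point are assumptions about the refactor branch and may differ, so treat every identifier as provisional.

```python
# Hypothetical invocation sketch; API names and the model identifier are assumptions.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                                      # assumed model type name
    model_args="pretrained=EleutherAI/pythia-160m",  # any HF causal LM checkpoint
    tasks=["drop"],
    num_fewshot=0,
)
print(results["results"])  # per-task metrics, expected to include em/f1 for drop
```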
### Checklist
@@ -2,8 +2,21 @@ task: drop
dataset_path: EleutherAI/drop
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs
doc_to_text: "{{passage}} {{question}}"
doc_to_target: "{{ answer|join(',')}}"
target_delimiter: ""
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{passage}} {{question}}"
generation_kwargs:
  until:
    - "."
metric_list:
  - metric: em
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: mean
    higher_is_better: true
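To make the template fields above concrete, here is a small Jinja2 rendering sketch. It is not part of the commit: the document values are invented, and `answer` is assumed to already be a flat list of strings, which may not match the raw dataset schema. The empty `target_delimiter` presumably means no extra separator is inserted between the rendered prompt and the target.

```python
# Illustrative only: render the YAML's Jinja templates against a toy document.
# Field values are invented; `answer` is assumed here to be a list of strings.
from jinja2 import Template

doc = {
    "passage": "The Broncos kicked two field goals in the first quarter.",
    "question": "How many field goals were kicked in the first quarter?",
    "answer": ["2"],
}

prompt = Template("{{passage}} {{question}}").render(**doc)
target = Template("{{ answer|join(',')}}").render(**doc)

print(prompt)  # context fed to the model: passage followed by the question
print(target)  # reference continuation: "2"
```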
import re
import string

import numpy as np
from scipy.optimize import linear_sum_assignment

_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)


def process_docs(dataset):
    def _process(doc):
        return {
            "id": doc["query_id"],
@@ -8,6 +15,7 @@ def process_doc(dataset):
            "question": doc["question"],
            "answers": get_answers(doc),
        }

    return dataset.map(_process)
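
# Note (added annotation): datasets.Dataset.map with a function that returns a dict
# adds or overwrites the returned keys while keeping the original columns, so each
# mapped doc still carries the raw DROP fields alongside the derived "answers" list.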
@@ -30,9 +38,7 @@ def get_answers(doc):
    answers = []
    answers_set = set()
    candidates = [doc["answer"]] + _flatten_validated_answers(doc["validated_answers"])
    for candidate in candidates:
        answer = parse_answer(candidate)
        if answer in answers_set:
@@ -41,6 +47,7 @@ def get_answers(doc):
        answers.append(answer)
    return answers


def parse_answer(answer):
    # NOTE: Everything is returned as a tuple for uniformity and hashability.
    if answer["number"] != "":
@@ -51,4 +58,147 @@ def parse_answer(answer):
        " ".join(
            [answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
        ).strip(),
    )
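
# (Annotation, not in the original file.) process_results receives the model's raw
# generations in `results` and the processed doc; it scores the prediction against
# every gold/validated answer and keeps the best exact-match and token-F1 score.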
def process_results(doc, results):
preds, golds = results, doc["answers"]
max_em = 0
max_f1 = 0
for gold_answer in golds:
exact_match, f1_score = get_metrics(preds, gold_answer)
if gold_answer[0].strip():
max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score)
return {"em": max_em, "f1": max_f1}
def get_metrics(predicted, gold):
"""
Takes a predicted answer and a gold answer (that are both either a string or a list of
strings), and returns exact match and the DROP F1 metric for the prediction. If you are
writing a script for evaluating objects in memory (say, the output of predictions during
validation, or while training), this is the function you want to call, after using
:func:`answer_json_to_strings` when reading the gold answer from the released data file.
"""
predicted_bags = _answer_to_bags(predicted)
gold_bags = _answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(
gold_bags[0]
):
exact_match = 1.0
else:
exact_match = 0.0
f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
f1 = np.mean(f1_per_bag)
f1 = round(f1, 2)
return exact_match, f1
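
# Worked example (annotation; values computed from the helper functions below):
#   get_metrics(["quarterback Tom Brady"], "Tom Brady") -> (0.0, 0.8)
# exact match fails because the normalized spans differ, while token-level F1 is
# 2 * (2/3 * 1) / (2/3 + 1) = 0.8 for the two shared tokens "tom" and "brady".
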
def _answer_to_bags(answer):
if isinstance(answer, (list, tuple)):
raw_spans = answer
else:
raw_spans = [answer]
normalized_spans = []
token_bags = []
for raw_span in raw_spans:
normalized_span = _normalize(raw_span)
normalized_spans.append(normalized_span)
token_bags.append(set(normalized_span.split()))
return normalized_spans, token_bags
def _align_bags(predicted, gold):
"""
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
"""
scores = np.zeros([len(gold), len(predicted)])
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
if _match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
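    # (Annotation) Negate the score matrix so SciPy's cost-minimizing Hungarian
    # assignment yields the one-to-one pairing that maximizes total F1.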
row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold), len(predicted))])
for row, column in zip(row_ind, col_ind):
max_scores[row] = max(max_scores[row], scores[row, column])
return max_scores
def _compute_f1(predicted_bag, gold_bag):
intersection = len(gold_bag.intersection(predicted_bag))
if not predicted_bag:
precision = 1.0
else:
precision = intersection / float(len(predicted_bag))
if not gold_bag:
recall = 1.0
else:
recall = intersection / float(len(gold_bag))
f1 = (
(2 * precision * recall) / (precision + recall)
if not (precision == 0.0 and recall == 0.0)
else 0.0
)
return f1
def _match_numbers_if_present(gold_bag, predicted_bag):
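    # (Annotation) DROP's number-sensitive matching: if the gold bag contains any
    # numeric tokens, the prediction must share at least one of them before the
    # span is allowed to earn F1 credit in _align_bags.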
gold_numbers = set()
predicted_numbers = set()
for word in gold_bag:
if _is_number(word):
gold_numbers.add(word)
for word in predicted_bag:
if _is_number(word):
predicted_numbers.add(word)
if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
return True
return False
def _is_number(text):
try:
float(text)
return True
except ValueError:
return False
def _remove_articles(text):
return _ARTICLES.sub(" ", text)
def _white_space_fix(text):
return " ".join(text.split())
def _remove_punc(text):
exclude = set(string.punctuation)
if not _is_number(text):
return "".join(ch for ch in text if ch not in exclude)
else:
return text
def _fix_number(text):
return str(float(text)) if _is_number(text) else text
def _tokenize(text):
return re.split(" |-", text)
def _normalize(answer):
tokens = [
_white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower()))))
for token in _tokenize(answer)
]
tokens = [token for token in tokens if token.strip()]
normalized = " ".join(tokens).strip()
return normalized
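
# Example of the full normalization pipeline (annotation; value traced by hand):
#   _normalize("The 1,000-yard rusher.") -> "1000.0 yard rusher"
# articles are dropped, punctuation is stripped, hyphens split tokens, and numeric
# tokens are canonicalized through float().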