"example/vscode:/vscode.git/clone" did not exist on "c7c47fd7ba105390e1d4307ac3db703d13c7e90e"
Commit 2d530664 authored by lintangsutawika

add qasper

parent af4b012e
# QASPER
### Paper
Title: `A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers`
Abstract: https://arxiv.org/abs/2105.03011
QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.
Homepage: https://allenai.org/data/qasper
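The task configs below consume a flattened view of this data. To inspect the raw nested schema first, a minimal sketch (assuming the Hugging Face dataset id `qasper`, the same `dataset_path` used in the task configs below) might look like:

```python
# Minimal sketch: peek at the raw QASPER schema before it is flattened
# by the task's `process_docs` helpers.
from datasets import load_dataset

qasper = load_dataset("qasper", split="validation")
paper = qasper[0]
print(paper["title"])
print(paper["qas"]["question"][0])           # first question for this paper
print(paper["qas"]["answers"][0]["answer"])  # list of answer blobs for that question
```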
### Citation
```
@article{DBLP:journals/corr/abs-2105-03011,
  author     = {Pradeep Dasigi and
                Kyle Lo and
                Iz Beltagy and
                Arman Cohan and
                Noah A. Smith and
                Matt Gardner},
  title      = {A Dataset of Information-Seeking Questions and Answers Anchored in
                Research Papers},
  journal    = {CoRR},
  volume     = {abs/2105.03011},
  year       = {2021},
  url        = {https://arxiv.org/abs/2105.03011},
  eprinttype = {arXiv},
  eprint     = {2105.03011},
  timestamp  = {Fri, 14 May 2021 12:13:30 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `qasper`: runs both QASPER subtasks, `qasper_bool` and `qasper_freeform`.
#### Tasks
* `qasper_bool`: multiple-choice variant over the questions whose gold answer is yes/no; the model scores the choices `"no"` and `"yes"` and is evaluated with F1.
* `qasper_freeform`: generative variant over the questions with free-form answers; the model's generation is scored with SQuAD-style abstractive token F1. Example documents for both variants are sketched after this list.
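Both tasks consume documents flattened by `utils.process_docs` (one question per document). The field values below are hypothetical and only illustrate the shape each variant expects:

```python
# Hypothetical flattened documents (field values are made up for illustration).
bool_doc = {
    "title": "Some NLP Paper",
    "abstract": "We study ...",
    "question": "Do the authors release their code?",   # yes/no question
    "answer": "yes",
    "answer_type": "bool",
}

freeform_doc = {
    "title": "Some NLP Paper",
    "abstract": "We study ...",
    "question": "What metric is used for evaluation?",  # free-form question
    "answer": "token-level F1",
    "answer_type": "free form answer",
}
```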
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: qasper
task: qasper_bool
dataset_path: qasper
output_type: multiple_choice
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_bool
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: 1
doc_to_choice: ["no", "yes"]
metric_list:
  - metric: f1
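As an illustration (not part of the task config), here is roughly the prompt that the `doc_to_text` template above produces for a hypothetical flattened document; under the `multiple_choice` output type the harness then scores the continuations listed in `doc_to_choice` (`"no"` and `"yes"`):

```python
# Rough illustration of the rendered prompt (the document is hypothetical).
doc = {
    "title": "Some NLP Paper",
    "abstract": "We study ...",
    "question": "Do the authors release their code?",
}
prompt = "TITLE: {title}\nABSTRACT: {abstract}\n\nQ: {question}\n\nA:".format(**doc)
print(prompt)
# TITLE: Some NLP Paper
# ABSTRACT: We study ...
#
# Q: Do the authors release their code?
#
# A:
```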
group: qasper
task: qasper_freeform
dataset_path: qasper
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: answer
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function metrics.f1_abstractive
    aggregation: mean
    higher_is_better: true
import re
import string
from collections import Counter


def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_abstractive(prediction, ground_truth):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
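A small worked example of `f1_abstractive`, with the intermediate values written out in the comments:

```python
# normalize_answer strips articles, so:
#   prediction   -> ["cat", "sat"]
#   ground truth -> ["cat", "sat", "on", "mat"]
# overlap = 2, precision = 2/2 = 1.0, recall = 2/4 = 0.5,
# F1 = 2 * 1.0 * 0.5 / 1.5 ≈ 0.667
score = f1_abstractive("The cat sat", "the cat sat on the mat")
print(round(score, 3))  # 0.667
```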
from functools import partial

from datasets import Dataset


def process_docs(dataset, set_answer_type="bool"):
    FEATURES = ["title", "abstract", "question", "answer", "answer_type"]

    def _categorise_answer(answer_blob):
        # Map a QASPER answer blob to a single answer string and its answer type.
        if answer_blob["unanswerable"]:
            answer = "unanswerable"
            answer_type = "unanswerable"
            return answer, answer_type
        elif answer_blob["yes_no"]:
            answer = "yes"
            answer_type = "bool"
            return answer, answer_type
        elif answer_blob["free_form_answer"]:
            answer = answer_blob["free_form_answer"]
            answer_type = "free form answer"
            return answer, answer_type
        elif answer_blob["extractive_spans"]:
            answer = answer_blob["extractive_spans"]
            answer_type = "extractive_spans"
            return answer, answer_type
        elif answer_blob["yes_no"] is False:
            answer = "no"
            answer_type = "bool"
            return answer, answer_type

    def _flatten(doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = {
            "title": [],
            "abstract": [],
            "question": [],
            "answer": [],
            "answer_type": [],
        }
        title = doc.pop("title")
        abstract = doc.pop("abstract")
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = _categorise_answer(answer_blob)
                if answer_type == set_answer_type:
                    obs_list["title"].append(title)
                    obs_list["abstract"].append(abstract)
                    obs_list["question"].append(question)
                    obs_list["answer_type"].append(answer_type)
                    if isinstance(answer, list):
                        answer = ", ".join(answer)
                    obs_list["answer"].append(answer)
        return obs_list

    dataset = dataset.map(
        _flatten,
        remove_columns=[key for key in dataset.features.keys() if key not in FEATURES],
    )
    # Each mapped row holds parallel lists; flatten them into one row per question.
    new_dataset = {}
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]
    return Dataset.from_dict(new_dataset)


process_docs_bool = partial(process_docs, set_answer_type="bool")
# The filter string must match the label produced by `_categorise_answer` above.
process_docs_freeform = partial(process_docs, set_answer_type="free form answer")
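To sanity-check the helpers above, a sketch like the following can be run directly against the validation split (assuming the same `qasper` dataset id used by the task configs):

```python
# Sketch: apply the flattening helpers directly and compare subset sizes.
from datasets import load_dataset

raw = load_dataset("qasper", split="validation")
bool_docs = process_docs_bool(raw)
freeform_docs = process_docs_freeform(raw)
print(len(bool_docs), len(freeform_docs))
print(bool_docs[0]["question"], "->", bool_docs[0]["answer"])
```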