Commit e2ed49f8 authored by baberabb's avatar baberabb

Merge remote-tracking branch 'origin/big-refactor' into big-refactor_math_sympy

parents a554e41b 91a37c90
# QASPER
### Paper
Title: `A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers`
Abstract: https://arxiv.org/abs/2105.03011
QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.
Homepage: https://allenai.org/data/qasper
### Citation
```
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `qasper`: executes both `qasper_bool` and `qasper_freeform`
#### Tasks
* `qasper_bool`: Multiple-choice task that evaluates the samples with `answer_type="bool"`
* `qasper_freeform`: Greedy-generation task that evaluates the samples with `answer_type="free form answer"`
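Both tasks can also be driven from Python. The sketch below is illustrative only: the `simple_evaluate` call, the `hf` model name, and the `pretrained=gpt2` checkpoint are assumptions about the harness interface and may need adjusting to your setup.
```python
# Hypothetical smoke test -- the model name, model_args, and simple_evaluate's exact
# signature are assumptions, not something this task definition prescribes.
import lm_eval.evaluator as evaluator

results = evaluator.simple_evaluate(
    model="hf",                    # assumed HuggingFace-backed model wrapper
    model_args="pretrained=gpt2",  # placeholder checkpoint
    tasks=["qasper_bool", "qasper_freeform"],
    limit=10,                      # keep the run small while testing
)
print(results["results"])
```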
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
  * [ ] Have you referenced the original paper that introduced the task?
  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: qasper
task: qasper_bool
dataset_path: qasper
output_type: multiple_choice
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_bool
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: 1
doc_to_choice: ["no", "yes"]
metric_list:
  - metric: f1
group: qasper
task: qasper_freeform
dataset_path: qasper
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: answer
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function metrics.f1_abstractive
    aggregation: mean
    higher_is_better: true
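For reference, `doc_to_text` above is a Jinja-style template over the flattened documents. The snippet below renders it with plain `jinja2` and a made-up record to show the resulting prompt; the harness's own rendering environment may differ slightly.
```python
# Illustrative only: render the doc_to_text template with a made-up document.
from jinja2 import Template

doc = {
    "title": "A Toy Paper",
    "abstract": "We study toys.",
    "question": "Do the authors release their code?",
}
prompt = Template(
    "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
).render(**doc)
print(prompt)
# TITLE: A Toy Paper
# ABSTRACT: We study toys.
#
# Q: Do the authors release their code?
#
# A:
```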
import re
import string
from collections import Counter


def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_abstractive(predictions, references):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = normalize_answer(predictions[0]).split()
    references_tokens = normalize_answer(references[0]).split()
    common = Counter(prediction_tokens) & Counter(references_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(references_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
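A quick, hand-checked sanity test of the token-overlap F1 above; it assumes the file is importable as `metrics`, the module name the YAML's `!function metrics.f1_abstractive` reference implies.
```python
# Illustrative check of f1_abstractive; not part of the task files.
from metrics import f1_abstractive

score = f1_abstractive(["the BERT base model"], ["a BERT large model"])
# normalize_answer lowercases and drops articles, leaving {"bert", "base", "model"}
# vs {"bert", "large", "model"}: 2 shared tokens, precision = recall = 2/3, F1 = 2/3.
print(round(score, 3))  # 0.667
```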
from datasets import Dataset
from functools import partial


def process_docs(dataset, set_answer_type="bool"):
    FEATURES = ["title", "abstract", "question", "answer", "answer_type"]

    def _categorise_answer(answer_blob):
        # Map a raw QASPER answer annotation to an (answer, answer_type) pair.
        if answer_blob["unanswerable"]:
            answer = "unanswerable"
            answer_type = "unanswerable"
            return answer, answer_type
        elif answer_blob["yes_no"]:
            answer = "yes"
            answer_type = "bool"
            return answer, answer_type
        elif answer_blob["free_form_answer"]:
            answer = answer_blob["free_form_answer"]
            answer_type = "free form answer"
            return answer, answer_type
        elif answer_blob["extractive_spans"]:
            answer = answer_blob["extractive_spans"]
            answer_type = "extractive_spans"
            return answer, answer_type
        elif answer_blob["yes_no"] is False:
            answer = "no"
            answer_type = "bool"
            return answer, answer_type

    def _flatten(doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = {
            "title": [],
            "abstract": [],
            "question": [],
            "answer": [],
            "answer_type": [],
        }
        title = doc.pop("title")
        abstract = doc.pop("abstract")
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = _categorise_answer(answer_blob)
                # Keep only the question/answer pairs whose answer type matches
                # the one requested for this task variant.
                if answer_type == set_answer_type:
                    obs_list["title"].append(title)
                    obs_list["abstract"].append(abstract)
                    obs_list["question"].append(question)
                    obs_list["answer_type"].append(answer_type)
                    if isinstance(answer, list):
                        answer = ", ".join(answer)
                    obs_list["answer"].append(answer)
        return obs_list

    dataset = dataset.map(
        _flatten,
        remove_columns=[key for key in dataset.features.keys() if key not in FEATURES],
    )
    # `_flatten` returns per-document lists; flatten them into one row per
    # question/answer pair.
    new_dataset = {}
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]
    return Dataset.from_dict(new_dataset)


process_docs_bool = partial(process_docs, set_answer_type="bool")
process_docs_freeform = partial(process_docs, set_answer_type="free form answer")
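A small sketch of what the flattening above produces, using a made-up record shaped like the HF `qasper` schema; it assumes this file is importable as `utils` (the name the YAML's `!function utils.process_docs_bool` reference implies).
```python
# Illustrative only: one document with one yes/no question, flattened for qasper_bool.
from datasets import Dataset

from utils import process_docs_bool

toy = Dataset.from_dict(
    {
        "title": ["A Toy Paper"],
        "abstract": ["We study toys."],
        "qas": [
            {
                "question": ["Do the authors release their code?"],
                "answers": [
                    {
                        "answer": [
                            {
                                "unanswerable": False,
                                "yes_no": True,
                                "free_form_answer": "",
                                "extractive_spans": [],
                            }
                        ]
                    }
                ],
            }
        ],
    }
)

flat = process_docs_bool(toy)
print(flat[0]["question"], "->", flat[0]["answer"])
# Do the authors release their code? -> yes
```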
@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
     raise Exception(
         "`pycountry` is required for generating translation task prompt templates. \
-        please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+        please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
     )
@@ -16,7 +16,6 @@ import gc
 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
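For context, the rewritten parser simply splits on commas and equals signs and keeps every value as a plain string (no OmegaConf type coercion); a standalone illustration:
```python
# Illustrative copy of the new parsing logic shown in the hunk above.
def simple_parse_args_string(args_string):
    args_string = args_string.strip()
    if not args_string:
        return {}
    arg_list = [arg for arg in args_string.split(",") if arg]
    return {k: v for k, v in [arg.split("=") for arg in arg_list]}

print(simple_parse_args_string("pretrained=gpt2,dtype=float32"))
# {'pretrained': 'gpt2', 'dtype': 'float32'}
```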
@@ -395,8 +394,10 @@ def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
-    module_name, function_name = function_name.split(".")
-    module_path = os.path.join(yaml_path, "{}.py".format(module_name))
+    *module_name, function_name = function_name.split(".")
+    if type(module_name) == list:
+        module_name = ".".join(module_name)
+    module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name)))
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
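The star-unpacking above keeps any dotted prefix of a `!function` reference together as the module name; a minimal illustration with made-up paths:
```python
# Illustrative only: resolve a "module.function" reference relative to a YAML file's
# directory, mirroring the new import_function logic. The paths are made up.
import os

yaml_path = "lm_eval/tasks/qasper"  # assumed directory of the task YAML
*module_name, function_name = "utils.process_docs_bool".split(".")
module_name = ".".join(module_name)  # -> "utils"
module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name)))
print(module_path, function_name)
# lm_eval/tasks/qasper/utils.py process_docs_bool (POSIX-style separators)
```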
@@ -430,8 +431,7 @@ def load_yaml_config(yaml_path):
             # If not found, assume the included yaml
             # is in the same dir as the original yaml
             if not os.path.isfile(path):
-                path = os.path.join(yaml_dir, path)
+                path = os.path.normpath(os.path.join(yaml_dir, path))
             try:
                 included_yaml_config = load_yaml_config(path)
                 final_yaml_config.update(included_yaml_config)
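The switch to `os.path.normpath` collapses redundant `..`/`.` segments in included-YAML paths that a plain `os.path.join` leaves in place:
```python
# Illustrative only: normpath cleans up relative segments in the joined path.
import os

print(os.path.join("configs", "../base.yaml"))                    # configs/../base.yaml
print(os.path.normpath(os.path.join("configs", "../base.yaml")))  # base.yaml
```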
@@ -25,10 +25,8 @@ dependencies = [
     "evaluate>=0.4.0",
     "jsonlines",
     "numexpr",
-    "omegaconf>=2.2",
     "peft>=0.2.0",
     "pybind11>=2.6.2",
-    "pycountry",
     "pytablewriter",
     "rouge-score>=0.0.4",
     "sacrebleu>=1.5.0",
@@ -65,8 +63,8 @@ linting = [
     "pre-commit",
 ]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1"]
-sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"]
+multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
+sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 promptsource = [
     "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
 ]
@@ -38,13 +38,15 @@ def main():
     iters = []
     for set in args.sets.split(","):
+        docs = None
         if set == "train" and task.has_training_docs():
            docs = task.training_docs()
         if set == "val" and task.has_validation_docs():
            docs = task.validation_docs()
         if set == "test" and task.has_test_docs():
            docs = task.test_docs()
-        iters.append(docs)
+        if docs is not None:
+            iters.append(docs)
     docs = join_iters(iters)
@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List
 import random
 import pytest
@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
@@ -9,6 +9,7 @@ import os
# This is the path where the output for the changed files for the tasks folder is stored
# FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
# reads a text file and returns a list of words
# used to read the output of the changed txt from tj-actions/changed-files
def load_changed_files(file_path: str) -> List[str]:
@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
     return list(_output)
-def new_tasks() -> Union[list[str], None]:
+def new_tasks() -> Union[List[str], None]:
     FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
     if os.path.exists(FILENAME):
         # If tasks folder has changed then we get the list of files from FILENAME