Unverified Commit fa2ae334 authored by Hailey Schoelkopf, committed by GitHub

Merge branch 'big-refactor' into bump-deps

parents 7c2687cb 54a53d6f
include: mutual.yaml
task: mutual_plus
dataset_name: mutual_plus
task: mutual
dataset_path: "EleutherAI/mutual"
dataset_name: mutual
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{article}}"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answers)}}"
doc_to_choice: "{{options}}"
process_docs: !function utils.process_docs
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{article}}"
metric_list:
- metric: r@1
aggregation: mean
higher_is_better: true
- metric: r@2
aggregation: mean
higher_is_better: true
- metric: mrr
aggregation: mean
higher_is_better: true
import numpy as np
def process_docs(dataset):
def _detokenize(text):
text = text.replace(" '", "'")
text = text.replace(" \n", "\n")
text = text.replace("\n ", "\n")
text = text.replace(" n't", "n't")
text = text.replace("`` ", '"')
text = text.replace("''", '"')
# punctuation
text = text.replace(" :", ":")
text = text.replace(" ;", ";")
text = text.replace(" !", "!")
text = text.replace(" ?", "?")
text = text.replace(" ,", ",")
text = text.replace(" .", ".")
return text
def _process(doc):
return {
"article": _detokenize(doc["article"]),
"options": [_detokenize(option) for option in doc["options"]],
}
return dataset.map(_process)
def process_results(doc, results):
gold = ["A", "B", "C", "D"].index(doc["answers"])
r4_1 = np.argmax(results) == gold # r4_1 = accuracy
ranks = sorted(results, reverse=True)
    r4_2 = (ranks.index(results[gold]) == 1) + r4_1  # top-2 recall: gold is the argmax or has the second-highest score
mrr = 1.0 / (ranks.index(results[gold]) + 1) # `+ 1` for index offset
return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}
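A quick, hypothetical sanity check of `process_results` (not part of the task files; the import path is an assumption about where the file above lives): given per-option loglikelihoods and the gold label, it reports whether the gold option ranks first (`r@1`), within the top two (`r@2`), and its reciprocal rank (`mrr`).

```python
# Hypothetical usage sketch; the module path is assumed, adjust to the actual layout.
from lm_eval.tasks.mutual.utils import process_results

doc = {"answers": "B"}              # gold option is "B" (index 1) -- made-up example
results = [-1.2, -1.5, -3.0, -2.4]  # loglikelihoods for options A-D

print(process_results(doc, results))
# The argmax is option A, so r@1 is falsy; the gold score (-1.5) is the
# second-highest, so r@2 is truthy and mrr = 1 / (1 + 1) = 0.5.
```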
# QASPER
### Paper
Title: `A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers`
Abstract: https://arxiv.org/abs/2105.03011
QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.
Homepage: https://allenai.org/data/qasper
### Citation
```
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `qasper`: executes both `qasper_bool` and `qasper_freeform`
#### Tasks
* `qasper_bool`: Multiple-choice task that evaluates the samples with `answer_type="bool"`
* `qasper_freeform`: Greedy-generation task that evaluates the samples with `answer_type="free form answer"`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: qasper
task: qasper_bool
dataset_path: qasper
output_type: multiple_choice
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_bool
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: 1
doc_to_choice: ["no", "yes"]
metric_list:
- metric: f1
group: qasper
task: qasper_freeform
dataset_path: qasper
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: answer
generation_kwargs:
until:
- "\n"
metric_list:
- metric: !function metrics.f1_abstractive
aggregation: mean
higher_is_better: true
import re
import string
from collections import Counter
def normalize_answer(s):
"""
Taken from the official evaluation script for v1.1 of the SQuAD dataset.
Lower text and remove punctuation, articles and extra whitespace.
"""
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_abstractive(predictions, references):
"""
Taken from the official evaluation script for v1.1 of the SQuAD dataset.
"""
prediction_tokens = normalize_answer(predictions[0]).split()
references_tokens = normalize_answer(references[0]).split()
common = Counter(prediction_tokens) & Counter(references_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(references_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
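As a hedged spot check (not part of `metrics.py`; the strings and the import path are made up for illustration), `f1_abstractive` computes token-level F1 over SQuAD-style normalized answers:

```python
# Hypothetical example; the import path is an assumption about the file above.
from lm_eval.tasks.qasper.metrics import f1_abstractive

pred = ["They use a BiLSTM encoder."]
ref = ["a BiLSTM encoder with attention"]

# After normalization, prediction tokens are {they, use, bilstm, encoder} and
# reference tokens are {bilstm, encoder, with, attention}: 2 shared tokens out
# of 4 on each side, so precision = recall = 0.5 and F1 = 0.5.
print(f1_abstractive(pred, ref))  # 0.5
```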
from datasets import Dataset
from functools import partial
def process_docs(dataset, set_answer_type="bool"):
FEATURES = ["title", "abstract", "question", "answer", "answer_type"]
def _categorise_answer(answer_blob):
if answer_blob["unanswerable"]:
answer = "unanswerable"
answer_type = "unanswerable"
return answer, answer_type
elif answer_blob["yes_no"]:
answer = "yes"
answer_type = "bool"
return answer, answer_type
elif answer_blob["free_form_answer"]:
answer = answer_blob["free_form_answer"]
answer_type = "free form answer"
return answer, answer_type
elif answer_blob["extractive_spans"]:
answer = answer_blob["extractive_spans"]
answer_type = "extractive_spans"
return answer, answer_type
elif answer_blob["yes_no"] is False:
answer = "no"
answer_type = "bool"
return answer, answer_type
def _flatten(doc):
"""Given a `doc`, flatten it out so that each JSON blob
contains exactly one question and one answer. Logic taken from
the reference implementation available at
https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
"""
obs_list = {
"title": [],
"abstract": [],
"question": [],
"answer": [],
"answer_type": [],
}
title = doc.pop("title")
abstract = doc.pop("abstract")
for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
for answer_blob in answer_list["answer"]:
answer, answer_type = _categorise_answer(answer_blob)
if answer_type == set_answer_type:
obs_list["title"].append(title)
obs_list["abstract"].append(abstract)
obs_list["question"].append(question)
obs_list["answer_type"].append(answer_type)
if type(answer) == list:
answer = ", ".join(answer)
obs_list["answer"].append(answer)
return obs_list
dataset = dataset.map(
_flatten,
remove_columns=[key for key in dataset.features.keys() if key not in FEATURES],
)
new_dataset = {}
for key in dataset.features.keys():
new_dataset[key] = [x for row in dataset[key] for x in row]
return Dataset.from_dict(new_dataset)
process_docs_bool = partial(process_docs, set_answer_type="bool")
process_docs_freeform = partial(process_docs, set_answer_type="free form answer")
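A minimal sketch of how the flattening behaves, using a hypothetical single-paper record (field values invented; only the fields the helpers above actually read are included, not the full QASPER schema):

```python
# Hypothetical toy input; the import path is an assumption about the file above.
from datasets import Dataset
from lm_eval.tasks.qasper.utils import process_docs_bool, process_docs_freeform

toy = Dataset.from_dict({
    "title": ["A Toy Paper"],
    "abstract": ["We study toys."],
    "qas": [{
        "question": ["Do they use pretrained embeddings?", "Which encoder is used?"],
        "answers": [
            {"answer": [{"unanswerable": False, "yes_no": True,
                         "free_form_answer": "", "extractive_spans": []}]},
            {"answer": [{"unanswerable": False, "yes_no": None,
                         "free_form_answer": "a BiLSTM encoder", "extractive_spans": []}]},
        ],
    }],
})

# Each filter keeps only question/answer pairs of the matching answer_type.
print(process_docs_bool(toy)["question"])    # ['Do they use pretrained embeddings?']
print(process_docs_freeform(toy)["answer"])  # ['a BiLSTM encoder']
```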
# Task-name
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
Homepage: `homepage to the benchmark's website goes here, if applicable`
### Citation
```
BibTeX-formatted citation goes here
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: .....
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: squadv2
dataset_path: squad_v2
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
until:
- "\n"
# filter_list:
# - name: remove_whitespace
# filter:
# - function: remove_whitespace
# - function: take_first
metric_list:
- metric: !function utils.exact
aggregation: mean
higher_is_better: true
- metric: !function utils.f1
aggregation: mean
higher_is_better: true
include: default.yaml
task: squadv2_noans_loglikelihood
dataset_path: squad_v2
output_type: loglikelihood
training_split: train
validation_split: validation
doc_to_target: " unanswerable"
metric_list:
- metric: perplexity
import re
import string
import collections
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s:
return []
return normalize_answer(s).split()
# Exact match (the normalized prediction exactly matches the normalized gold answer)
def exact(predictions, references):
return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))
# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
gold_toks = get_tokens(references[0])
pred_toks = get_tokens(predictions[0])
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
# If either is no-answer, then F1 is 1 if they agree, 0 otherwise
return int(gold_toks == pred_toks)
if num_same == 0:
return 0
precision = 1.0 * num_same / len(pred_toks)
recall = 1.0 * num_same / len(gold_toks)
f1 = (2 * precision * recall) / (precision + recall)
return f1
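A couple of hedged spot checks (example strings invented; the import path is an assumption about the file above) for the SQuAD-style scorers:

```python
# Hypothetical examples; adjust the import path to the actual layout.
from lm_eval.tasks.squadv2.utils import exact, f1

print(exact(["The Eiffel Tower"], ["Eiffel Tower"]))  # 1 -> "the" is dropped as an article
print(f1(["The Eiffel Tower"], ["Eiffel Tower"]))     # 1.0

# Partial overlap: prediction tokens {in, paris, france}, gold tokens {paris}
# -> precision 1/3, recall 1, F1 = 0.5
print(f1(["in Paris, France"], ["Paris"]))            # 0.5

# No-answer convention: empty vs. empty agrees (1); empty vs. non-empty does not (0).
print(f1([""], [""]), f1([""], ["Paris"]))            # 1 0
```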
group: squadv2_complete
task:
- squadv2
- squadv2_noans_loglikelihood
......@@ -10,7 +10,7 @@ try:
except ModuleNotFoundError:
raise Exception(
"`pycountry` is required for generating translation task prompt templates. \
please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
)
......
......@@ -16,7 +16,6 @@ import gc
import torch
import transformers
from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice
......@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
args_string = args_string.strip()
if not args_string:
return {}
arg_list = args_string.split(",")
args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
arg_list = [arg for arg in args_string.split(",") if arg]
args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
return args_dict
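For reference, a hedged illustration (input string made up) of what the OmegaConf-free replacement returns: it splits on commas and `=` and keeps every value as a string, with no type coercion.

```python
# Hypothetical call against the patched helper in lm_eval.utils.
from lm_eval.utils import simple_parse_args_string

print(simple_parse_args_string("pretrained=gpt2,revision=main,batch_size=8"))
# {'pretrained': 'gpt2', 'revision': 'main', 'batch_size': '8'}  (note: '8' stays a string)
```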
......@@ -395,8 +394,10 @@ def import_function(loader, node):
function_name = loader.construct_scalar(node)
yaml_path = os.path.dirname(loader.name)
module_name, function_name = function_name.split(".")
module_path = os.path.join(yaml_path, "{}.py".format(module_name))
*module_name, function_name = function_name.split(".")
if type(module_name) == list:
module_name = ".".join(module_name)
module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name)))
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
......@@ -430,8 +431,7 @@ def load_yaml_config(yaml_path):
# If not found, assume the included yaml
# is in the same dir as the original yaml
if not os.path.isfile(path):
path = os.path.join(yaml_dir, path)
path = os.path.normpath(os.path.join(yaml_dir, path))
try:
included_yaml_config = load_yaml_config(path)
final_yaml_config.update(included_yaml_config)
......
......@@ -11,7 +11,6 @@ from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_task_folder
from lm_eval.benchmarks import include_benchmarks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
......
......@@ -25,10 +25,8 @@ dependencies = [
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
"omegaconf>=2.2",
"peft>=0.2.0",
"pybind11>=2.6.2",
"pycountry",
"pytablewriter",
"rouge-score>=0.0.4",
"sacrebleu>=1.5.0",
......@@ -65,8 +63,8 @@ linting = [
"pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
......
......@@ -50,7 +50,9 @@ def main():
docs = join_iters(iters)
with open(os.path.join(args.output_base_path, task_name), "w") as f:
with open(
os.path.join(args.output_base_path, task_name), "w", encoding="utf8"
) as f:
for i, doc in (
zip(range(args.num_examples), docs)
if args.num_examples > 0
......