Commit 2041dc34 authored by haileyschoelkopf

Merge branch 'big-refactor' into bigbench

parents 67c0f73a 15f4a3ef
-task: pubmed_qa
-dataset_path: pubmed_qa
-dataset_name: pqa_labeled
+task: pubmedqa
+dataset_path: bigbio/pubmed_qa
+dataset_name: pubmed_qa_labeled_fold0_source
output_type: multiple_choice
-training_split: null
-validation_split: null
-test_split: train
+training_split: train
+validation_split: validation
+test_split: test
doc_to_text: !function preprocess_pubmedqa.doc_to_text
doc_to_target: final_decision
doc_to_choice: ["yes", "no", "maybe"]
......
# QASPER
### Paper
Title: `A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers`
Abstract: https://arxiv.org/abs/2105.03011
QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.
Homepage: https://allenai.org/data/qasper
### Citation
```
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `qasper`: executes both `qasper_bool` and `qasper_freeform`
#### Tasks
* `qasper_bool`: Multiple-choice task over the questions whose `answer_type` is `"bool"`
* `qasper_freeform`: Greedy-generation task over the questions whose `answer_type` is `"free form answer"`
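A minimal usage sketch for these tasks via the Python API (assuming `evaluator.simple_evaluate` and the `hf` model type are available on this branch; the model name and `limit` below are illustrative):

```python
from lm_eval import evaluator

# Smoke-test both QASPER subtasks on a small HF model (placeholder model name).
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-70m",
    tasks=["qasper_bool", "qasper_freeform"],
    limit=10,  # only a handful of documents, for a quick check
)
print(results["results"])
```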
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: qasper
task: qasper_bool
dataset_path: qasper
output_type: multiple_choice
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_bool
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: 1
doc_to_choice: ["no", "yes"]
metric_list:
- metric: f1
group: qasper
task: qasper_freeform
dataset_path: qasper
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: answer
generation_kwargs:
  until:
    - "\n"
metric_list:
  - metric: !function metrics.f1_abstractive
    aggregation: mean
    higher_is_better: true
import re
import string
from collections import Counter
def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_abstractive(predictions, references):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = normalize_answer(predictions[0]).split()
    references_tokens = normalize_answer(references[0]).split()
    common = Counter(prediction_tokens) & Counter(references_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(references_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
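For illustration, partial token overlap between a prediction and a gold free-form answer yields a fractional score (the example strings are invented):

```python
pred = ["they use a BiLSTM encoder"]
gold = ["a BiLSTM encoder with attention"]
# After normalization both sides have 4 tokens and share 2 ("bilstm", "encoder"),
# so precision = recall = 0.5 and F1 = 0.5.
print(f1_abstractive(pred, gold))  # 0.5
```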
from datasets import Dataset
from functools import partial
def process_docs(dataset, set_answer_type="bool"):
    FEATURES = ["title", "abstract", "question", "answer", "answer_type"]

    def _categorise_answer(answer_blob):
        # Map a raw QASPER answer blob onto a single (answer, answer_type) pair.
        if answer_blob["unanswerable"]:
            answer = "unanswerable"
            answer_type = "unanswerable"
            return answer, answer_type
        elif answer_blob["yes_no"]:
            answer = "yes"
            answer_type = "bool"
            return answer, answer_type
        elif answer_blob["free_form_answer"]:
            answer = answer_blob["free_form_answer"]
            answer_type = "free form answer"
            return answer, answer_type
        elif answer_blob["extractive_spans"]:
            answer = answer_blob["extractive_spans"]
            answer_type = "extractive_spans"
            return answer, answer_type
        elif answer_blob["yes_no"] is False:
            answer = "no"
            answer_type = "bool"
            return answer, answer_type

    def _flatten(doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = {
            "title": [],
            "abstract": [],
            "question": [],
            "answer": [],
            "answer_type": [],
        }
        title = doc.pop("title")
        abstract = doc.pop("abstract")
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = _categorise_answer(answer_blob)
                if answer_type == set_answer_type:
                    obs_list["title"].append(title)
                    obs_list["abstract"].append(abstract)
                    obs_list["question"].append(question)
                    obs_list["answer_type"].append(answer_type)
                    if type(answer) == list:
                        answer = ", ".join(answer)
                    obs_list["answer"].append(answer)
        return obs_list

    dataset = dataset.map(
        _flatten,
        remove_columns=[key for key in dataset.features.keys() if key not in FEATURES],
    )
    new_dataset = {}
    # Collapse the per-paper lists produced by `_flatten` into one flat column per feature.
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]
    return Dataset.from_dict(new_dataset)


process_docs_bool = partial(process_docs, set_answer_type="bool")
process_docs_freeform = partial(process_docs, set_answer_type="free form answer")
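A toy invocation of the flattening logic above (the nested record imitates the QASPER schema; all values are invented for illustration):

```python
from datasets import Dataset

toy = Dataset.from_dict(
    {
        "title": ["A Toy Paper"],
        "abstract": ["We study toy problems."],
        "qas": [
            {
                "question": ["Do the authors use a BiLSTM?"],
                "answers": [
                    {
                        "answer": [
                            {
                                "unanswerable": False,
                                "yes_no": True,
                                "free_form_answer": "",
                                "extractive_spans": [],
                            }
                        ]
                    }
                ],
            }
        ],
    }
)

# Keeps only the boolean question and flattens it to a single row:
# answer == "yes", answer_type == "bool".
print(process_docs_bool(toy)[0])
```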
# Task-name
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
Homepage: `homepage to the benchmark's website goes here, if applicable`
### Citation
```
BibTeX-formatted citation goes here
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: .....
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: squadv2
dataset_path: squad_v2
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
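The Jinja2 `doc_to_target` above returns the list of gold answer strings, falling back to a list containing the empty string when the question is unanswerable; roughly equivalent Python, assuming the standard `squad_v2` answer layout:

```python
def doc_to_target(doc):
    # `doc["answers"]["text"]` is a (possibly empty) list of gold answer strings.
    texts = doc["answers"]["text"]
    return texts if len(texts) > 0 else [""]
```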
include: default.yaml
task: squadv2_noans_loglikelihood
dataset_path: squad_v2
output_type: loglikelihood
training_split: train
validation_split: validation
doc_to_target: " unanswerable"
metric_list:
- metric: perplexity
import re
import string
import collections
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized prediction exactly matches the normalized gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
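For illustration (invented strings): answerable questions are scored by token overlap after normalization, while the no-answer case is scored by agreement on empty token lists:

```python
print(exact(["Albert Einstein"], ["albert einstein"]))  # 1: identical after normalization
print(f1(["the theory of relativity"], ["theory of general relativity"]))  # ~0.857
print(f1([""], [""]))  # 1: both sides predict "no answer"
```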
group: squadv2_complete
task:
- squadv2
- squadv2_noans_loglikelihood
group: storycloze
task: storycloze_2016
task: storycloze_2018
dataset_path: story_cloze
dataset_name: 2018
output_type: multiple_choice
......
......@@ -10,7 +10,7 @@ try:
except ModuleNotFoundError:
raise Exception(
"`pycountry` is required for generating translation task prompt templates. \
-please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
)
......
......@@ -16,7 +16,6 @@ import gc
import torch
import transformers
from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice
......@@ -46,6 +45,14 @@ def escaped_split(text, sep_char, maxsplit=-1):
return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
def handle_arg_string(arg):
if arg.lower() == "true":
return True
elif arg.lower() == "false":
return False
return arg
def simple_parse_args_string(args_string):
"""
Parses something like
......@@ -55,8 +62,10 @@ def simple_parse_args_string(args_string):
args_string = args_string.strip()
if not args_string:
return {}
-arg_list = args_string.split(",")
-args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+arg_list = [arg for arg in args_string.split(",") if arg]
+args_dict = {
+    k: handle_arg_string(v) for k, v in [arg.split("=") for arg in arg_list]
+}
return args_dict
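With the new parsing above, a comma-separated `key=value` string becomes a plain dict, and only the literal strings "true"/"false" are coerced to booleans (the example input is invented):

```python
print(simple_parse_args_string("pretrained=gpt2,dtype=float32,trust_remote_code=true"))
# {'pretrained': 'gpt2', 'dtype': 'float32', 'trust_remote_code': True}
```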
......@@ -267,9 +276,9 @@ def make_table(result_dict, column: str = "results"):
from pytablewriter import MarkdownTableWriter, LatexTableWriter
if column == "results":
column_name = "Task"
elif column == "aggregate":
column_name = "Benchmark"
column_name = "Tasks"
elif column == "groups":
column_name = "Groups"
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
......@@ -395,8 +404,10 @@ def import_function(loader, node):
function_name = loader.construct_scalar(node)
yaml_path = os.path.dirname(loader.name)
-module_name, function_name = function_name.split(".")
-module_path = os.path.join(yaml_path, "{}.py".format(module_name))
+*module_name, function_name = function_name.split(".")
+if type(module_name) == list:
+module_name = ".".join(module_name)
+module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name)))
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
......@@ -430,8 +441,7 @@ def load_yaml_config(yaml_path):
# If not found, assume the included yaml
# is in the same dir as the original yaml
if not os.path.isfile(path):
-path = os.path.join(yaml_dir, path)
+path = os.path.normpath(os.path.join(yaml_dir, path))
try:
included_yaml_config = load_yaml_config(path)
final_yaml_config.update(included_yaml_config)
......
......@@ -11,7 +11,6 @@ from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_task_folder
from lm_eval.benchmarks import include_benchmarks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
......@@ -209,8 +208,8 @@ def main() -> None:
f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
)
print(evaluator.make_table(results))
if "aggregate" in results:
print(evaluator.make_table(results, "aggregate"))
if "groups" in results:
print(evaluator.make_table(results, "groups"))
if __name__ == "__main__":
......
[mypy]
-python_version = 3.9
+python_version = 3.8
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
......
[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "1.0.0"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
requires-python = ">=3.8"
license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.21.0",
"evaluate",
"datasets>=2.0.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
"peft>=0.2.0",
"pybind11>=2.6.2",
"pytablewriter",
"rouge-score>=0.0.4",
"sacrebleu>=1.5.0",
"scikit-learn>=0.24.1",
"sqlitedict",
"torch>=1.8",
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
]
[tool.setuptools]
packages = ["lm_eval"]
# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]
[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"
[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = [
"flake8",
"pylint",
"mypy",
"pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
"lm_eval[dev]",
"lm_eval[testing]",
"lm_eval[linting]",
"lm_eval[multilingual]",
"lm_eval[sentencepiece]",
"lm_eval[promptsource]",
"lm_eval[gptq]",
"lm_eval[anthropic]",
"lm_eval[openai]"
]
......@@ -38,17 +38,21 @@ def main():
iters = []
for set in args.sets.split(","):
docs = None
if set == "train" and task.has_training_docs():
docs = task.training_docs()
if set == "val" and task.has_validation_docs():
docs = task.validation_docs()
if set == "test" and task.has_test_docs():
docs = task.test_docs()
-iters.append(docs)
+if docs is not None:
+iters.append(docs)
docs = join_iters(iters)
-with open(os.path.join(args.output_base_path, task_name), "w") as f:
+with open(
+os.path.join(args.output_base_path, task_name), "w", encoding="utf8"
+) as f:
for i, doc in (
zip(range(args.num_examples), docs)
if args.num_examples > 0
......
import setuptools
import itertools
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
extras_require = {
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"linting": [
"flake8",
"pylint",
"mypy",
"pre-commit",
],
"testing": ["pytest", "pytest-cov", "pytest-xdist"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
"promptsource": [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
],
"gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
"anthropic": ["anthropic"],
"openai": ["openai", "tiktoken"],
}
extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
setuptools.setup(
name="lm_eval",
version="1.0.0",
author="EleutherAI",
author_email="contact@eleuther.ai",
description="A framework for evaluating language models",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/EleutherAI/lm-evaluation-harness",
packages=setuptools.find_packages(),
# required to include yaml files in pip installation
package_data={
"lm_eval": ["**/*.yaml", "tasks/**/*"],
"examples": ["**/*.yaml"],
},
entry_points={
"console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
},
include_package_data=True,
classifiers=[
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires=">=3.9",
install_requires=[
"accelerate>=0.21.0",
"evaluate",
"datasets>=2.0.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
"omegaconf>=2.2",
"peft>=0.2.0",
"pybind11>=2.6.2",
"pytablewriter",
"rouge-score>=0.0.4",
"sacrebleu>=1.5.0",
"scikit-learn>=0.24.1",
"sqlitedict",
"torch>=1.8",
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
],
extras_require=extras_require,
)
# This is to make sure that the package supports editable installs
setuptools.setup()
......@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from typing import List
import random
import pytest
......@@ -26,7 +27,7 @@ import pytest
)
],
)
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
task_name = task_name
limit = 10
......