Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into squadv2

3263c572 · lintangsutawika · a27e8ed1 · 33d52483 · 3263c572 · 3263c572
Commit 3263c572 authored Sep 18, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml
+group:
+  - super-glue-t5-prompt
+task: super_glue-rte-t5-prompt
+dataset_path: super_glue
+dataset_name: rte
+training_split: train
+validation_split: validation
+output_type: greedy_until
+doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
+doc_to_target: label
+doc_to_choice: ['entailment', 'not_entailment']
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
--- a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml
+group:
+  - super-glue-t5-prompt
+task: super_glue-wic-t5-prompt
+dataset_path: super_glue
+dataset_name: wic
+training_split: train
+validation_split: validation
+output_type: greedy_until
+doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
+doc_to_target: label
+doc_to_choice: ['False', 'True']
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
--- a/lm_eval/tasks/super_glue/wsc/default.yaml
+++ b/lm_eval/tasks/super_glue/wsc/default.yaml
@@ -2,7 +2,7 @@ group:
  - super-glue-lm-eval-v1
 task: wsc
 dataset_path: super_glue
-dataset_name: wsc
+dataset_name: wsc.fixed
 output_type: multiple_choice
 training_split: train
 validation_split: validation

--- a/lm_eval/tasks/super_glue/wsc/preprocess_wsc.py
+++ b/lm_eval/tasks/super_glue/wsc/preprocess_wsc.py
-import re
 from lm_eval.utils import general_detokenize
-def t5_prompt_doc_to_text(x):
-    def _mark_span(text, span_str, span_idx, mark):
-        pattern_tmpl = r"^((?:\S+\s){N})(W)"
-        pattern = re.sub("N", str(span_idx), pattern_tmpl)
-        pattern = re.sub("W", span_str, pattern)
-        return re.sub(pattern, r"\1{0} \2 {0}".format(mark), text)
-    text = x["text"]
-    text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
-    # Compensate for 2 added "words" added in previous step.
-    span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
-    text = _mark_span(text, x["span2_text"], span2_index, "#")
-    return text
 def default_doc_to_text(x):
    raw_passage = x["text"]
    # NOTE: HuggingFace span indices are word-based not character-based.

--- a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
+++ b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml
@@ -2,16 +2,17 @@ group:
  - super-glue-t5-prompt
 task: super_glue-wsc-t5-prompt
 dataset_path: super_glue
-dataset_name: wsc
+dataset_name: wsc.fixed
 training_split: train
 validation_split: validation
 output_type: greedy_until
-doc_to_text: !function "preprocess_wsc.t5_prompt_doc_to_text"
+doc_to_text: !function "t5_utils.doc_to_text"
 doc_to_target: label
-doc_to_choice: ['False', 'True']
 metric_list:
-  - metric: exact_match
+  - metric: accuracy
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
+filter_list:
-    ignore_punctuation: true
+  - name: "wsc_postprocessor"
+    filter:
+      - function: !function t5_utils.WSCPostprocess
--- a/lm_eval/tasks/super_glue/wsc/t5_utils.py
+++ b/lm_eval/tasks/super_glue/wsc/t5_utils.py
+import re
+from lm_eval.api.filter import Filter
+# Build the T5-style WSC prompt for one document: take the passage produced by
+# _wsc_inputs (pronoun swapped for the placeholder "X"), replace " X " with the
+# pronoun wrapped in asterisks, and prefix the result with "wsc: ".
+def doc_to_text(x):
+    # re.sub replaces every " X " in the passage, not only the placeholder.
+    # NOTE(review): the replacement is built from x["span2_text"] and is parsed
+    # by re.sub as a template, so a backslash in it would act as an escape.
+    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
+    return "wsc: " + text
+# Return the passage x["text"] with the word at x["span2_index"] (the pronoun)
+# replaced by the placeholder token "X".
+# Raises AssertionError when span2_index is outside the accepted range, or when
+# the word at that index differs from x["span2_text"] (special cases excepted).
+def _wsc_inputs(x):
+    # Word positions are defined by splitting on single spaces only.
+    words = x["text"].split(" ")
+    # We would need some special logic to handle the case where the pronoun is the
+    # first or last word in the text. None of the examples in WSC seem to have
+    # this, so we are ignoring these cases.
+    assert x["span2_index"] > 0
+    # NOTE(review): this bound still admits the last word (index len(words) - 1).
+    assert x["span2_index"] < len(words)
+    pronoun_index = x["span2_index"]
+    # Rebuild the passage as "<words before> X <words after>".
+    def create_input():
+        # Guard against span2_index pointing at a word other than the pronoun.
+        assert words[pronoun_index] == x["span2_text"]
+        return " ".join(
+            [
+                " ".join(words[:pronoun_index]),
+                "X",
+                " ".join(words[pronoun_index + 1 :]),
+            ]
+        )
+    # Handle some special cases.
+    # These passages are matched by exact text and return a hard-coded result
+    # before create_input() runs, so its word-equality assert is never reached.
+    # Presumably the pronoun is fused with punctuation in this passage, so the
+    # word at span2_index would not equal span2_text -- TODO confirm.
+    if (
+        x["text"]
+        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
+    ):
+        return (
+            "The boy continued to whip the pony , and eventually the pony threw "
+            'him over. John laughed out quite loud. "Good for X ," he said.'
+        )
+    # Using the span2_index, we get 'use' instead of 'it'.
+    if (
+        x["text"]
+        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
+    ):
+        return (
+            "When they had eventually calmed down a bit , and had gotten home, "
+            "Mr. Farley put the magic pebble in an iron safe . Some day they might "
+            "want to use X , but really for now, what more could they wish for?"
+        )
+    # General case: substitute the placeholder by word index.
+    return create_input()
+# Filter that converts each generated answer into a boolean: True when the
+# cleaned prediction and the cleaned reference span (span1_text) have word sets
+# where one is a subset of the other, False otherwise.
+class WSCPostprocess(Filter):
+    # Accepts and ignores any keyword arguments; only the determiner set is
+    # initialised here (the base-class __init__ is not invoked).
+    def __init__(self, **kwargs):
+        # Words removed by clean() before predictions and references are compared.
+        self.determiners = {
+            "a",
+            "an",
+            "few",
+            "her",
+            "his",
+            "each",
+            "every",
+            "many",
+            "much",
+            "my",
+            "our",
+            "some",
+            "that",
+            "the",
+            "their",
+            "these",
+            "this",
+            "those",
+            "which",
+            "whose",
+            "your",
+        }
+    def clean(self, s):
+        """Ignore capitalization and determiners."""
+        s = s.strip().lower()
+        # Tokens come from splitting on single spaces; other whitespace stays
+        # inside the tokens.
+        return " ".join([w for w in s.split(" ") if w not in self.determiners])
+    # Score every (response, reference) pair and return one boolean per pair.
+    # assumes resps holds one sequence of generations per document and that
+    # docs["span1_text"] yields the matching reference strings -- TODO confirm.
+    def apply(self, resps, docs):
+        filtered_resps = []
+        for prediction, reference in zip(*(resps, docs["span1_text"])):
+            # Only the first generation of each response is scored.
+            prediction = self.clean(prediction[0])
+            reference = self.clean(reference)
+            if ("'" in prediction) != ("'" in reference):
+                # Make sure we don't mark cases where the prediction is "Bob" and the
+                # referent is "Bob's hat" as predicting the referent.
+                predicted_referent = False
+            else:
+                prediction_words = set(prediction.split(" "))
+                referent_words = set(reference.split(" "))
+                # Handle cases where the prediction is "fuzzy bunny" and the referent is
+                # "bunny".
+                predicted_referent = prediction_words.issubset(
+                    referent_words
+                ) or referent_words.issubset(prediction_words)
+            filtered_resps.append(predicted_referent)
+        return filtered_resps
--- a/lm_eval/tasks/translation/README.md
+++ b/lm_eval/tasks/translation/README.md
+# Translation Tasks
+### Paper
+### Citation
+```
+```
+### Groups and Tasks
+#### Groups
+* `gpt3_translation_benchmarks`
+* `wmt14`
+* `wmt16`
+* `wmt20`
+* `iwslt2017`
+#### Tasks
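+* `iwslt2017-ar-en`
+* `iwslt2017-en-ar`
+* `wmt14-en-fr`
+* `wmt14-fr-en`
+* `wmt16-de-en`
+* `wmt16-en-de`
+* `wmt16-en-ro`
+* `wmt16-ro-en`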
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+  * [ ] Checked for equivalence with v0.3.0 LM Evaluation Harness
--- a/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
+# Generated by utils.py
+dataset_name: iwslt2017-en-ar
+dataset_path: iwslt2017
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'Arabic phrase: {{translation["ar"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- iwslt2017
+include: wmt_common_yaml
+task: iwslt2017-ar-en
--- a/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
+# Generated by utils.py
+dataset_name: iwslt2017-en-ar
+dataset_path: iwslt2017
+doc_to_target: ' {{translation["ar"]}}'
+doc_to_text: 'English phrase: {{translation["en"]}}
+  Arabic phrase:'
+group:
+- greedy_until
+- translation
+- iwslt2017
+include: wmt_common_yaml
+task: iwslt2017-en-ar
--- a/lm_eval/tasks/translation/utils.py
+++ b/lm_eval/tasks/translation/utils.py
+import argparse
+from typing import Dict, List
+import yaml
+import sacrebleu
+try:
+    import pycountry
+except ModuleNotFoundError:
+    raise Exception(
+        "`pycountry` is required for generating translation task prompt templates. \
+please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
+    )
+# Different translation benchmarks included in the library. Mostly WMT.
+# These correspond to dataset names (subsets) on HuggingFace for each dataset.
+# A yaml file is generated by this script for each language pair.
+# Keys are dataset paths; values list one direction per language pair (the
+# generator emits both directions for every entry).
+gpt3_translation_benchmarks = {
+    "wmt14": ["fr-en"],  # ["en-fr", "fr-en"],  # French
+    "wmt16": [
+        "ro-en",
+        "de-en",
+    ],  # ["en-ro", "ro-en", "de-en", "en-de"],  # German, Romanian
+}
+# 28 total
+# NOTE(review): "28 total" does not match the 4 language pairs listed in
+# LANGUAGES below (8 yaml files once both directions are generated) -- likely
+# stale; confirm.
+LANGUAGES = {
+    **gpt3_translation_benchmarks,
+    # "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
+    "iwslt2017": ["en-ar"],  # Arabic
+}
+# Return the `name` attribute of the pycountry language record matching a
+# language code (e.g. the codes on either side of "fr-en").
+def code_to_language(code):
+    # key is alpha_2 or alpha_3 depending on the code length
+    language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code})
+    # NOTE(review): presumably the lookup yields None for an unknown code, in
+    # which case .name raises AttributeError -- confirm against pycountry.
+    return language_tuple.name
+# Writes one task yaml per translation direction for every entry in LANGUAGES.
+# Raises FileExistsError only after all files have been attempted, listing the
+# files that were skipped because they already existed.
+def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
+    """
+    Generate a yaml file for each language.
+    :param output_dir: The directory to output the files to.
+    :param overwrite: Whether to overwrite files if they already exist.
+    """
+    # Names of files that could not be created because they already exist.
+    err = []
+    # `lang` is the LANGUAGES key (a dataset path such as "wmt14"), not a language.
+    for lang in LANGUAGES.keys():
+        for dataset_name in LANGUAGES[lang]:
+            src_lang, _, tgt_lang = dataset_name.partition("-")
+            for src, tgt in [[src_lang, tgt_lang], [tgt_lang, src_lang]]:
+                # both translation directions for each lang pair
+                lang_pair = src + "-" + tgt
+                file_name = f"{lang}_{lang_pair}.yaml"
+                try:
+                    source, target = code_to_language(src), code_to_language(tgt)
+                    groups = ["greedy_until", "translation", lang]
+                    if lang in gpt3_translation_benchmarks.keys():
+                        groups += ["gpt3_translation_benchmarks"]
+                    # Mode "x" refuses to open an existing file, which is what
+                    # triggers the FileExistsError handled below.
+                    with open(
+                        f"{output_dir}/{file_name}",
+                        "w" if overwrite else "x",
+                        encoding="utf8",
+                    ) as f:
+                        f.write("# Generated by utils.py\n")
+                        # Both directions share the same dataset_name (the pair as
+                        # listed in LANGUAGES); iwslt2017 subsets get an
+                        # "iwslt2017-" prefix.
+                        yaml.dump(
+                            {
+                                "include": "wmt_common_yaml",
+                                "group": groups,
+                                "dataset_path": lang,
+                                "dataset_name": dataset_name
+                                if not (lang == "iwslt2017")
+                                else "iwslt2017-" + dataset_name,
+                                "task": f"{lang}-{lang_pair}",
+                                "doc_to_text": f"{source} phrase: "
+                                + "{{translation["
+                                + f'"{src}"'
+                                + "]}}\n"
+                                + f"{target} phrase:",
+                                "doc_to_target": " {{"
+                                + "translation["
+                                + f'"{tgt}"]'
+                                + "}}",
+                            },
+                            f,
+                        )
+                except FileExistsError:
+                    # Record the clash and keep generating the remaining files.
+                    err.append(file_name)
+    if len(err) > 0:
+        raise FileExistsError(
+            "Files were not created because they already exist (use --overwrite flag):"
+            f" {', '.join(err)}"
+        )
+# Command-line entry point: reads --overwrite and --output-dir, then delegates
+# to gen_lang_yamls.
+def main() -> None:
+    """Parse CLI args and generate language-specific yaml files."""
+    parser = argparse.ArgumentParser()
+    # Flag is off by default, so existing files are reported instead of replaced.
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Overwrite files if they already exist",
+    )
+    # Defaults to the current working directory.
+    parser.add_argument(
+        "--output-dir", default=".", help="Directory to write yaml files to"
+    )
+    args = parser.parse_args()
+    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
+# Run the generator only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/translation/wmt14_en-fr.yaml
+++ b/lm_eval/tasks/translation/wmt14_en-fr.yaml
+# Generated by utils.py
+dataset_name: fr-en
+dataset_path: wmt14
+doc_to_target: ' {{translation["fr"]}}'
+doc_to_text: 'English phrase: {{translation["en"]}}
+  French phrase:'
+group:
+- greedy_until
+- translation
+- wmt14
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt14-en-fr
--- a/lm_eval/tasks/translation/wmt14_fr-en.yaml
+++ b/lm_eval/tasks/translation/wmt14_fr-en.yaml
+# Generated by utils.py
+dataset_name: fr-en
+dataset_path: wmt14
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'French phrase: {{translation["fr"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- wmt14
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt14-fr-en
--- a/lm_eval/tasks/translation/wmt16_de-en.yaml
+++ b/lm_eval/tasks/translation/wmt16_de-en.yaml
+# Generated by utils.py
+dataset_name: de-en
+dataset_path: wmt16
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'German phrase: {{translation["de"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- wmt16
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt16-de-en
--- a/lm_eval/tasks/translation/wmt16_en-de.yaml
+++ b/lm_eval/tasks/translation/wmt16_en-de.yaml
+# Generated by utils.py
+dataset_name: de-en
+dataset_path: wmt16
+doc_to_target: ' {{translation["de"]}}'
+doc_to_text: 'English phrase: {{translation["en"]}}
+  German phrase:'
+group:
+- greedy_until
+- translation
+- wmt16
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt16-en-de
--- a/lm_eval/tasks/translation/wmt16_en-ro.yaml
+++ b/lm_eval/tasks/translation/wmt16_en-ro.yaml
+# Generated by utils.py
+dataset_name: ro-en
+dataset_path: wmt16
+doc_to_target: ' {{translation["ro"]}}'
+doc_to_text: 'English phrase: {{translation["en"]}}
+  Romanian phrase:'
+group:
+- greedy_until
+- translation
+- wmt16
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt16-en-ro
--- a/lm_eval/tasks/translation/wmt16_ro-en.yaml
+++ b/lm_eval/tasks/translation/wmt16_ro-en.yaml
+# Generated by utils.py
+dataset_name: ro-en
+dataset_path: wmt16
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'Romanian phrase: {{translation["ro"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- wmt16
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt16-ro-en
--- a/lm_eval/tasks/translation/wmt_common_yaml
+++ b/lm_eval/tasks/translation/wmt_common_yaml
+output_type: greedy_until
+training_split: train
+validation_split: validation
+fewshot_split: validation
+test_split: test
+metric_list:
+  - metric: bleu
+  - metric: ter
+  - metric: chrf
+generation_kwargs:
+  until:
+    - "\n"
+  do_sample: false
+  temperature: 0.0
+repeats: 1
--- a/lm_eval/tasks/wikitext/preprocess_wikitext.py
+++ b/lm_eval/tasks/wikitext/preprocess_wikitext.py
@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc):
    string = string.replace(" 's", "'s")
    return string
+# Pair the document's loglikelihood with a word count and a byte count, keyed
+# by metric name.
+# NOTE(review): uses `re`; its import is not visible in this hunk -- confirm it
+# exists at module level.
+def process_results(doc, results):
+    # results must hold exactly one element; otherwise the unpacking raises
+    # ValueError.
+    (loglikelihood,) = results
+    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*
+    _words = len(re.split(r"\s+", doc["page"]))
+    # Size in bytes of the UTF-8 encoding of the same original page text.
+    _bytes = len(doc["page"].encode("utf-8"))
+    # Both byte-based metrics receive the same (loglikelihood, byte count) pair.
+    return {
+        "word_perplexity": (loglikelihood, _words),
+        "byte_perplexity": (loglikelihood, _bytes),
+        "bits_per_byte": (loglikelihood, _bytes),
+    }
--- a/lm_eval/tasks/wikitext/wikitext.yaml
+++ b/lm_eval/tasks/wikitext/wikitext.yaml
@@ -7,6 +7,7 @@ validation_split: validation
 test_split: test
 doc_to_text: ""
 doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
+process_results: !function preprocess_wikitext.process_results
 should_decontaminate: true
 doc_to_decontamination_query: "{{page}}"
 metric_list:

--- a/lm_eval/tasks/wsc273/README.md
+++ b/lm_eval/tasks/wsc273/README.md
+# WSC273
+### Paper
+Title: `The Winograd Schema Challenge`
+Abstract: http://commonsensereasoning.org/2011/papers/Levesque.pdf
+A Winograd schema is a pair of sentences that differ in only one or two words
+and that contain an ambiguity that is resolved in opposite ways in the two
+sentences and requires the use of world knowledge and reasoning for its resolution.
+The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
+NOTE: This evaluation of the Winograd Schema Challenge is based on `partial evaluation`
+as described by Trinh & Le in "A Simple Method for Commonsense Reasoning" (2018).
+See: https://arxiv.org/abs/1806.02847
+Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
+### Citation
+```
+@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
+    title = "The winograd schema challenge",
+    abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Winograd schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
+    author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
+    year = "2012",
+    language = "English (US)",
+    isbn = "9781577355601",
+    series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
+    publisher = "Institute of Electrical and Electronics Engineers Inc.",
+    pages = "552--561",
+    booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
+    note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
+}
+```
+### Groups and Tasks
+#### Groups
+* Not part of any group yet.
+#### Tasks
+* `wsc273`
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?