Commit a07d05f7 authored by baberabb

Merge remote-tracking branch 'origin/big-refactor' into nqopen_baber

# Conflicts:
#	lm_eval/api/task.py
parents b1d468f2 6ba2a2b0
import collections
import re
import string

import numpy as np
from tqdm import tqdm
from datasets import Dataset, concatenate_datasets

from lm_eval.api.metrics import metric_max_over_ground_truths
def doc_to_text(doc):
    # Collapse "@highlight" markers in the passage into ordinary sentence breaks.
    passage = doc["passage"]
    passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage)
    passage = re.sub(r"\n@highlight\n", ". ", passage)
    return " ".join(
        [
            "record query:",
            doc["query"],
            "entities:",
            ", ".join(doc["entities"]),
            "passage:",
            passage,
        ]
    )
def process_docs(dataset):
    def split_answers(doc):
        # Duplicate every field once per answer so that each (doc, answer)
        # pair becomes its own row after the flattening below.
        split_doc = {
            **{k: [] for k in doc.keys()},
        }
        answers = doc.pop("answers")
        for answer in answers:
            for key in split_doc.keys():
                if key in doc:
                    split_doc[key].append(doc[key])
            split_doc["answers"].append(answer)
        return split_doc

    dataset = dataset.map(split_answers)
    new_dataset = {}
    # Flatten the per-doc lists into one row per (doc, answer) pair.
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]
    return Dataset.from_dict(new_dataset)
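# Illustrative sketch (invented doc, not a real ReCoRD row): process_docs
# flattens a multi-answer doc into one row per (doc, answer) pair, e.g.
#
#   Dataset.from_dict({"query": ["Who founded @placeholder?"],
#                      "answers": [["Apple", "Apple Inc."]]})
#
# becomes {"query": ["Who founded @placeholder?", "Who founded @placeholder?"],
#          "answers": ["Apple", "Apple Inc."]} after flattening.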
def normalize_squad(answer):
    """Normalization used in official SQuAD evaluation script."""

    def _normalize_answer(text, punc_chars, punc_repl):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(s):
            return re.sub(r"\b(a|an|the)\b", " ", s)

        def replace_punctuation(s):
            to_replace = set(punc_chars)
            return "".join(punc_repl if ch in to_replace else ch for ch in s)

        def white_space_fix(s):
            return " ".join(s.split())

        text = text.lower()
        text = replace_punctuation(text)
        text = remove_articles(text)
        text = white_space_fix(text)
        return text

    return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="")
def em(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])


def f1(predictions, references):  # This is a passthrough function
    return (predictions[0], references[0])
def squad_em_agg(items):
    def _exact_match_score(prediction, target):
        return target == prediction

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        # References are encoded as "<group-id>_<gold answer>"; split only on
        # the first underscore so gold answers containing "_" survive intact.
        group, reference = reference.split("_", 1)
        grouped_values[group][0].append(normalize_squad(prediction))
        grouped_values[group][1].append(normalize_squad(reference))

    em = []
    for group in grouped_values.keys():
        predictions, targets = grouped_values[group]
        for p in predictions:
            em.append(metric_max_over_ground_truths(_exact_match_score, p, targets))
    return np.mean(em)
def squad_f1_agg(items):
    def _f1_score(prediction, target):
        """Computes token f1 score for a single target and prediction."""
        prediction_tokens = prediction.split()
        target_tokens = target.split()
        common = collections.Counter(prediction_tokens) & collections.Counter(
            target_tokens
        )
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1.0 * num_same / len(prediction_tokens)
        recall = 1.0 * num_same / len(target_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    grouped_values = collections.defaultdict(lambda: ([], []))
    for prediction, reference in items:
        group, reference = reference.split("_", 1)
        # Keep only the first prediction per group; every reference is kept.
        if group not in grouped_values:
            grouped_values[group][0].append(normalize_squad(prediction))
        grouped_values[group][1].append(normalize_squad(reference))

    f1 = []
    for group in grouped_values.keys():
        p, t = grouped_values[group]
        f1.append(metric_max_over_ground_truths(_f1_score, p[0], t))
    return np.mean(f1)
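For intuition, a hedged sanity check of the grouping convention the aggregators above expect (the items are invented; in the harness they come from the `em`/`f1` passthroughs, with each reference encoded as `"<group-id>_<gold answer>"`):

```python
items = [
    ("The Eiffel Tower", "q1_Eiffel Tower"),
    ("The Eiffel Tower", "q1_the eiffel tower!"),
    ("Berlin", "q2_Paris"),
]
print(squad_em_agg(items))  # ~0.667: both q1 predictions match after normalization, q2 misses
print(squad_f1_agg(items))  # 0.5: token F1 is 1.0 for q1, 0.0 for q2
```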
group:
  - super-glue-t5-prompt
task: super_glue-rte-t5-prompt
dataset_path: super_glue
dataset_name: rte
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'not_entailment']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
group:
  - super-glue-t5-prompt
task: super_glue-wic-t5-prompt
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
@@ -2,7 +2,7 @@ group:
   - super-glue-lm-eval-v1
 task: wsc
 dataset_path: super_glue
-dataset_name: wsc
+dataset_name: wsc.fixed
 output_type: multiple_choice
 training_split: train
 validation_split: validation
import re

from lm_eval.utils import general_detokenize


def t5_prompt_doc_to_text(x):
    def _mark_span(text, span_str, span_idx, mark):
        # Wrap the `span_idx`-th word (expected to equal `span_str`) in `mark`.
        pattern_tmpl = r"^((?:\S+\s){N})(W)"
        pattern = re.sub("N", str(span_idx), pattern_tmpl)
        pattern = re.sub("W", span_str, pattern)
        return re.sub(pattern, r"\1{0} \2 {0}".format(mark), text)

    text = x["text"]
    text = _mark_span(text, x["span1_text"], x["span1_index"], "*")
    # Compensate for the two "words" added by the previous step.
    span2_index = x["span2_index"] + 2 * (x["span1_index"] < x["span2_index"])
    text = _mark_span(text, x["span2_text"], span2_index, "#")
    return text
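# Worked example (invented WSC-style record): with
#   {"text": "Mark told Pete many lies about himself",
#    "span1_text": "Pete", "span1_index": 2,
#    "span2_text": "himself", "span2_index": 6}
# t5_prompt_doc_to_text returns
#   "Mark told * Pete * many lies about # himself #"
# (span2's index is shifted by the two '*' tokens inserted before it).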
def default_doc_to_text(x):
    raw_passage = x["text"]
    # NOTE: HuggingFace span indices are word-based, not character-based.
    ...
@@ -2,16 +2,17 @@ group:
   - super-glue-t5-prompt
 task: super_glue-wsc-t5-prompt
 dataset_path: super_glue
-dataset_name: wsc
+dataset_name: wsc.fixed
 training_split: train
 validation_split: validation
 output_type: greedy_until
-doc_to_text: !function "preprocess_wsc.t5_prompt_doc_to_text"
+doc_to_text: !function "t5_utils.doc_to_text"
 doc_to_target: label
 doc_to_choice: ['False', 'True']
 metric_list:
-  - metric: exact_match
+  - metric: accuracy
     aggregation: mean
     higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
+filter_list:
+  - name: "wsc_postprocessor"
+    filter:
+      - function: !function t5_utils.WSCPostprocess
import re

from lm_eval.api.filter import Filter


def doc_to_text(x):
    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
    return "wsc: " + text
def _wsc_inputs(x):
    words = x["text"].split(" ")

    # We would need some special logic to handle the case where the pronoun is
    # the first or last word in the text. None of the examples in WSC seem to
    # have this, so we are ignoring these cases.
    assert x["span2_index"] > 0
    assert x["span2_index"] < len(words)
    pronoun_index = x["span2_index"]

    def create_input():
        assert words[pronoun_index] == x["span2_text"]
        return " ".join(
            [
                " ".join(words[:pronoun_index]),
                "X",
                " ".join(words[pronoun_index + 1 :]),
            ]
        )

    # Handle some special cases.
    if (
        x["text"]
        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
    ):
        return (
            "The boy continued to whip the pony , and eventually the pony threw "
            'him over. John laughed out quite loud. "Good for X ," he said.'
        )

    # Using the span2_index, we get 'use' instead of 'it'.
    if (
        x["text"]
        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
    ):
        return (
            "When they had eventually calmed down a bit , and had gotten home, "
            "Mr. Farley put the magic pebble in an iron safe . Some day they might "
            "want to use X , but really for now, what more could they wish for?"
        )

    return create_input()
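# Worked example (invented doc) for the common, non-special-cased path: with
#   {"text": "The trophy doesn't fit in the suitcase because it is too big .",
#    "span2_text": "it", "span2_index": 8}
# _wsc_inputs yields "The trophy doesn't fit in the suitcase because X is too big ."
# and doc_to_text then produces
#   "wsc: The trophy doesn't fit in the suitcase because *it* is too big ."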
class WSCPostprocess(Filter):
    def __init__(self, **kwargs):
        self.determiners = {
            "a", "an", "few", "her", "his", "each", "every", "many",
            "much", "my", "our", "some", "that", "the", "their",
            "these", "this", "those", "which", "whose", "your",
        }

    def clean(self, s):
        """Ignore capitalization and determiners."""
        s = s.strip().lower()
        return " ".join([w for w in s.split(" ") if w not in self.determiners])

    def apply(self, resps, docs):
        filtered_resps = []
        for prediction, reference in zip(resps, docs["span1_text"]):
            prediction = self.clean(prediction[0])
            reference = self.clean(reference)
            if ("'" in prediction) != ("'" in reference):
                # Make sure we don't mark cases where the prediction is "Bob"
                # and the referent is "Bob's hat" as predicting the referent.
                predicted_referent = False
            else:
                prediction_words = set(prediction.split(" "))
                referent_words = set(reference.split(" "))
                # Handle cases where the prediction is "fuzzy bunny" and the
                # referent is "bunny".
                predicted_referent = prediction_words.issubset(
                    referent_words
                ) or referent_words.issubset(prediction_words)
            filtered_resps.append(predicted_referent)
        return filtered_resps
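A minimal sketch of the filter in isolation (toy responses and docs; in the harness `apply` is invoked internally on a dataset, but any mapping with a `"span1_text"` column works for illustration):

```python
wsc_filter = WSCPostprocess()
resps = [["the fuzzy bunny"], ["Bob"]]          # one generation per doc
docs = {"span1_text": ["bunny", "Bob's hat"]}   # gold referents
# "fuzzy bunny" contains "bunny" -> True; "Bob" vs "Bob's hat" disagree on "'" -> False
print(wsc_filter.apply(resps, docs))  # [True, False]
```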
# Translation Tasks
### Paper
### Citation
```
```
### Groups and Tasks
#### Groups
* `gpt3_translation_tasks`
* `wmt14`
* `wmt16`
* `wmt20`
* `iwslt2017`
#### Tasks
*
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
* [ ] Checked for equivalence with v0.3.0 LM Evaluation Harness
# Generated by utils.py
dataset_name: iwslt2017-en-ar
dataset_path: iwslt2017
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Arabic phrase: {{translation["ar"]}}
English phrase:'
group:
- greedy_until
- translation
- iwslt2017
include: wmt_common_yaml
task: iwslt2017-ar-en
# Generated by utils.py
dataset_name: iwslt2017-en-ar
dataset_path: iwslt2017
doc_to_target: ' {{translation["ar"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
Arabic phrase:'
group:
- greedy_until
- translation
- iwslt2017
include: wmt_common_yaml
task: iwslt2017-en-ar
import argparse
from typing import Dict, List

import sacrebleu
import yaml

try:
    import pycountry
except ModuleNotFoundError:
    raise Exception(
        "`pycountry` is required for generating translation task prompt templates. "
        "Please install it via `pip install lm-eval[multilingual]` or `pip install -e .[multilingual]`.",
    )
# Different translation benchmarks included in the library. Mostly WMT.
# These correspond to dataset names (subsets) on HuggingFace for each dataset.
# A yaml file is generated by this script for each language pair.
gpt3_translation_benchmarks = {
    "wmt14": ["fr-en"],  # ["en-fr", "fr-en"], # French
    "wmt16": [
        "ro-en",
        "de-en",
    ],  # ["en-ro", "ro-en", "de-en", "en-de"], # German, Romanian
}

# 28 total
LANGUAGES = {
    **gpt3_translation_benchmarks,
    # "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
    "iwslt2017": ["en-ar"],  # Arabic
}
def code_to_language(code):
    # key is alpha_2 or alpha_3 depending on the code length
    language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code})
    return language_tuple.name
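# e.g. code_to_language("fr") -> "French" (alpha_2 lookup) and
#      code_to_language("ron") -> "Romanian" (alpha_3 lookup).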
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate a yaml file for each language.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    """
    err = []
    for lang in LANGUAGES.keys():
        for dataset_name in LANGUAGES[lang]:
            src_lang, _, tgt_lang = dataset_name.partition("-")
            for src, tgt in [[src_lang, tgt_lang], [tgt_lang, src_lang]]:
                # both translation directions for each lang pair
                lang_pair = src + "-" + tgt
                file_name = f"{lang}_{lang_pair}.yaml"
                try:
                    source, target = code_to_language(src), code_to_language(tgt)
                    groups = ["greedy_until", "translation", lang]
                    if lang in gpt3_translation_benchmarks.keys():
                        groups += ["gpt3_translation_benchmarks"]
                    with open(
                        f"{output_dir}/{file_name}",
                        "w" if overwrite else "x",
                        encoding="utf8",
                    ) as f:
                        f.write("# Generated by utils.py\n")
                        yaml.dump(
                            {
                                "include": "wmt_common_yaml",
                                "group": groups,
                                "dataset_path": lang,
                                "dataset_name": dataset_name
                                if not (lang == "iwslt2017")
                                else "iwslt2017-" + dataset_name,
                                "task": f"{lang}-{lang_pair}",
                                "doc_to_text": f"{source} phrase: "
                                + "{{translation["
                                + f'"{src}"'
                                + "]}}\n"
                                + f"{target} phrase:",
                                "doc_to_target": " {{"
                                + "translation["
                                + f'"{tgt}"]'
                                + "}}",
                            },
                            f,
                        )
                except FileExistsError:
                    err.append(file_name)

    if len(err) > 0:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )
def main() -> None:
    """Parse CLI args and generate language-specific yaml files."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    args = parser.parse_args()
    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)


if __name__ == "__main__":
    main()
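Typical usage, equivalent to running `python utils.py --output-dir . --overwrite` (a sketch; the path is illustrative):

```python
# Regenerate all language-pair yamls in the current directory,
# clobbering any that already exist:
gen_lang_yamls(output_dir=".", overwrite=True)
```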
# Generated by utils.py
dataset_name: fr-en
dataset_path: wmt14
doc_to_target: ' {{translation["fr"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
French phrase:'
group:
- greedy_until
- translation
- wmt14
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt14-en-fr
# Generated by utils.py
dataset_name: fr-en
dataset_path: wmt14
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'French phrase: {{translation["fr"]}}
English phrase:'
group:
- greedy_until
- translation
- wmt14
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt14-fr-en
# Generated by utils.py
dataset_name: de-en
dataset_path: wmt16
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'German phrase: {{translation["de"]}}
English phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-de-en
# Generated by utils.py
dataset_name: de-en
dataset_path: wmt16
doc_to_target: ' {{translation["de"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
German phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-en-de
# Generated by utils.py
dataset_name: ro-en
dataset_path: wmt16
doc_to_target: ' {{translation["ro"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
Romanian phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-en-ro
# Generated by utils.py
dataset_name: ro-en
dataset_path: wmt16
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Romanian phrase: {{translation["ro"]}}
English phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-ro-en
output_type: greedy_until
training_split: train
validation_split: validation
fewshot_split: validation
test_split: test
metric_list:
  - metric: bleu
  - metric: ter
  - metric: chrf
generation_kwargs:
  until:
    - "\n"
  do_sample: false
  temperature: 0.0
repeats: 1
@@ -34,3 +34,15 @@ def wikitext_detokenizer(doc):
     string = string.replace(" 's", "'s")
 
     return string
+
+
+def process_results(doc, results):
+    (loglikelihood,) = results
+    # IMPORTANT: wikitext counts number of words in *original doc before detokenization*
+    _words = len(re.split(r"\s+", doc["page"]))
+    _bytes = len(doc["page"].encode("utf-8"))
+    return {
+        "word_perplexity": (loglikelihood, _words),
+        "byte_perplexity": (loglikelihood, _bytes),
+        "bits_per_byte": (loglikelihood, _bytes),
+    }
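Each metric entry pairs the document loglikelihood with a weight (word or byte count), and the pairs are combined corpus-wide at aggregation time. A minimal sketch, assuming the harness's usual weighted-perplexity and bits-per-byte formulas (the function names here are illustrative, not the harness API):

```python
import math

def weighted_perplexity(pairs):
    # pairs: [(loglikelihood, weight), ...] accumulated over all documents
    lls, weights = zip(*pairs)
    return math.exp(-sum(lls) / sum(weights))

def bits_per_byte(pairs):
    lls, n_bytes = zip(*pairs)
    return -sum(lls) / (sum(n_bytes) * math.log(2))
```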
@@ -7,6 +7,7 @@ validation_split: validation
 test_split: test
 doc_to_text: ""
 doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
+process_results: !function preprocess_wikitext.process_results
 should_decontaminate: true
 doc_to_decontamination_query: "{{page}}"
 metric_list: