Unverified Commit 3b4fa26e authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into wmt

parents d01cc479 8f448eed
...@@ -69,7 +69,7 @@ class OpenaiCompletionsLM(LM):
         engine: str = "text-davinci-003",
         truncate: bool = False,
         batch_size: int = 1,
-    ):
+    ) -> None:
         """
         :param engine: str
...@@ -99,12 +99,12 @@ class OpenaiCompletionsLM(LM):
         return self.end_of_text_token_id

     @property
-    def max_length(self):
+    def max_length(self) -> int:
         # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
         return 2048

     @property
-    def max_gen_toks(self):
+    def max_gen_toks(self) -> int:
         return 256

     @property
...@@ -152,7 +152,7 @@ class OpenaiCompletionsLM(LM):
         return self._loglikelihood_tokens(new_reqs)

     def _loglikelihood_tokens(
-        self, requests, disable_tqdm=False
+        self, requests, disable_tqdm: bool = False
     ) -> List[Tuple[float, bool]]:
         res = []
...
...@@ -41,7 +41,7 @@ def textsynth_completion(**kwargs):
 @register_model("textsynth")
 class TextSynthLM(LM):
-    def __init__(self, engine, truncate=False):
+    def __init__(self, engine, truncate: bool = False) -> None:
         """
         :param engine: str
             TextSynth API engine (e.g. `gptj_6B`)
...@@ -62,12 +62,12 @@ class TextSynthLM(LM):
         raise NotImplementedError()

     @property
-    def max_length(self):
+    def max_length(self) -> int:
         # NOTE: Turn on truncation to avoid errors on long inputs.
         return 2048

     @property
-    def max_gen_toks(self):
+    def max_gen_toks(self) -> int:
         return 256

     @property
...
...@@ -5,7 +5,7 @@ from lm_eval.logger import eval_logger
 # Stores prompts in a dictionary indexed by 2 levels:
 # prompt category name, and prompt name.
 # This allows us to access prompts
-PROMPT_REGISTRY = {
+PROMPT_REGISTRY: dict[str, dict[str, str]] = {
     "qa-basic": {
         "question-newline-answer": "Question: {{question}}\nAnswer:",
         "q-newline-a": "Q: {{question}}\nA:",
...@@ -13,7 +13,7 @@ PROMPT_REGISTRY = {
 }


-def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
+def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None):
     # unpack prompt name
     category_name, prompt_name = prompt_id.split(":")
     if subset_name is None:
...
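For context on the annotated registry above: `get_prompt` takes a prompt id of the form `category:name`, splits it on `:`, and looks the template up in the two-level `PROMPT_REGISTRY` (the real function also takes `dataset_name`/`subset_name` parameters, which are omitted here). A minimal sketch of that lookup, using only what the diff shows:

```python
# Simplified lookup sketch; mirrors the registry structure shown in the diff above.
PROMPT_REGISTRY = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
        "q-newline-a": "Q: {{question}}\nA:",
    },
}


def lookup_prompt(prompt_id: str) -> str:
    # "qa-basic:q-newline-a" -> category "qa-basic", prompt "q-newline-a"
    category_name, prompt_name = prompt_id.split(":")
    return PROMPT_REGISTRY[category_name][prompt_name]


print(lookup_prompt("qa-basic:q-newline-a"))  # prints the registered template string
```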
...@@ -5,8 +5,8 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] Glue
 - [x] SuperGlue
-- [ ] CoQA (Lintang)
+- [x] CoQA
-- [ ] DROP (Lintang)
+- [x] DROP
 - [x] ~~Lambada~~
 - [x] Lambada (Cloze variants)
 - [x] ~~Lambada (Multilingual)~~
...@@ -29,7 +29,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] HeadQA
 - [x] MathQA
 - [x] WebQs
-- [ ] WSC273 (Lintang)
+- [x] WSC273
 - [x] Winogrande
 - [x] ANLI
 - [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
...@@ -38,7 +38,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] TruthfulQA (gen)
 - [ ] MuTual
 - [ ] Hendrycks Math (Hailey)
-- [ ] Asdiv
+- [x] Asdiv
 - [ ] GSM8k
 - [x] Arithmetic
 - [ ] MMMLU (Hailey)
...
...@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
 )


-def register_configurable_task(config):
+def register_configurable_task(config: dict[str, str]) -> int:
     SubClass = type(
         config["task"] + "ConfigurableTask",
         (ConfigurableTask,),
...@@ -38,7 +38,7 @@ def register_configurable_task(config):
     return 0


-def check_prompt_config(config):
+def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
     all_configs = []
     if "use_prompt" in config:
         prompt_list = prompts.load_prompt_list(
...@@ -69,14 +69,14 @@ def check_prompt_config(config):
     return all_configs


-def get_task_name_from_config(task_config):
+def get_task_name_from_config(task_config: dict[str, str]) -> str:
     if "dataset_name" in task_config:
         return "{dataset_path}_{dataset_name}".format(**task_config)
     else:
         return "{dataset_path}".format(**task_config)


-def include_task_folder(task_dir):
+def include_task_folder(task_dir: str) -> None:
     """
     Calling this function
     """
...
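The annotated `register_configurable_task` above builds a new `ConfigurableTask` subclass at runtime with `type()` and returns 0 on success. A minimal, self-contained sketch of that `type()` pattern (the stand-in base class and the `CONFIG` attribute name are illustrative, not the harness's exact internals):

```python
# Illustration of dynamic subclass creation with type(), as used by register_configurable_task.
class ConfigurableTask:  # stand-in for lm_eval.api.task.ConfigurableTask
    CONFIG: dict = {}


def make_task_class(config: dict) -> type:
    # type(name, bases, namespace) creates a new class object at runtime.
    return type(
        config["task"] + "ConfigurableTask",
        (ConfigurableTask,),
        {"CONFIG": config},
    )


AsdivTask = make_task_class({"task": "asdiv", "dataset_path": "EleutherAI/asdiv"})
print(AsdivTask.__name__)                # asdivConfigurableTask
print(AsdivTask.CONFIG["dataset_path"])  # EleutherAI/asdiv
```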
task: asdiv
dataset_path: EleutherAI/asdiv
output_type: loglikelihood
validation_split: validation
doc_to_text: "{{body}}\nQuestion:{{question}}\nAnswer:"
doc_to_target: "{{answer.split(' (')[0]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
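The `doc_to_target` template above keeps only the part of the raw ASDiv answer that precedes the parenthesized unit annotation. A rough Python equivalent of that Jinja expression (the sample answer strings are made up for illustration):

```python
# Python equivalent of the Jinja expression {{answer.split(' (')[0]}} used in doc_to_target.
def asdiv_target(answer: str) -> str:
    # Keep everything before the first " (", which drops a trailing "(unit)" if present.
    return answer.split(" (")[0]


print(asdiv_target("72 (cards)"))  # 72
print(asdiv_target("6"))           # 6  (unchanged when there is no unit)
```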
# CoQA
### Paper
Title: `CoQA: A Conversational Question Answering Challenge`
Abstract: https://arxiv.org/pdf/1808.07042.pdf
CoQA is a large-scale dataset for building Conversational Question Answering
systems. The goal of the CoQA challenge is to measure the ability of machines to
understand a text passage and answer a series of interconnected questions that
appear in a conversation.
Homepage: https://stanfordnlp.github.io/coqa/
### Citation
```
BibTeX-formatted citation goes here
```
### Groups and Tasks
#### Groups
* Not part of a group yet
#### Tasks
* `coqa`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: coqa
dataset_path: EleutherAI/coqa
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{story}} {{question.input_text|join('\n')}}"
generation_kwargs:
until:
- "\nQ:"
metric_list:
- metric: em
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
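In the config above, `generation_kwargs.until` supplies stop sequences for the `greedy_until` output type: the continuation is cut off once a stop string such as `\nQ:` appears, so the model answers only the current question. A minimal sketch of that truncation step (a hypothetical helper, not the harness's own implementation):

```python
# Hypothetical helper showing how stop sequences like ["\nQ:"] truncate a generation.
def cut_at_stop_sequences(generation: str, until: list[str]) -> str:
    for stop in until:
        idx = generation.find(stop)
        if idx != -1:
            generation = generation[:idx]
    return generation


raw = " The Vatican Library.\nQ: Where is it located?\n\nA:"
print(cut_at_stop_sequences(raw, ["\nQ:"]))  # " The Vatican Library."
```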
from itertools import zip_longest
import transformers.data.metrics.squad_metrics as squad_metrics
def doc_to_text(doc):
# Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
# and a question qi, the task is to predict the answer ai
doc_text = doc["story"] + "\n\n"
for (q, a) in zip_longest(
doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]
): # omit target answer ai
question = f"Q: {q}\n\n"
answer = f"A: {a}\n\n" if a is not None else "A:"
doc_text += question + answer
return doc_text
def doc_to_target(doc):
turn_id = len(doc["questions"]["input_text"])
# Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
answers = []
answer_forturn = doc["answers"]["input_text"][turn_id - 1]
answers.append(answer_forturn)
additional_answers = doc.get("additional_answers")
if additional_answers:
for key in additional_answers:
additional_answer_for_turn = additional_answers[key]["input_text"][
turn_id - 1
]
if additional_answer_for_turn.lower() not in map(str.lower, answers):
answers.append(additional_answer_for_turn)
return answers
def em(gold_list, pred):
    # tests for exact match on the normalised answer (compute_exact)
em_sum = 0.0
if len(gold_list) > 1:
for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :]
# predictions compared against (n) golds and take maximum
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
return em_sum / max(1, len(gold_list))
def compute_scores(gold_list, pred):
    # tests for exact match on the normalised answer (compute_exact)
    # and for token overlap (compute_f1)
f1_sum = 0.0
em_sum = 0.0
if len(gold_list) > 1:
for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :]
# predictions compared against (n) golds and take maximum
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)
return {
"em": em_sum / max(1, len(gold_list)),
"f1": f1_sum / max(1, len(gold_list)),
}
def process_results(doc, results):
gold_list = doc_to_target(doc)
pred = results[0].strip().split("\n")[0]
scores = compute_scores(gold_list, pred)
return scores
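To make the leave-one-out scoring in `em`/`compute_scores` above concrete: when a question has several gold answers, the prediction is compared against every gold except the held-out one, the per-answer maximum is taken, and the result is averaged over the golds. A toy usage example (illustrative strings, not real CoQA data; requires `transformers` for `squad_metrics`):

```python
# Toy usage of compute_scores defined above.
gold_list = ["the Vatican Library", "Vatican Library", "a library"]
pred = "The Vatican Library"

print(compute_scores(gold_list, pred))
# Expected to be {'em': 1.0, 'f1': 1.0} here, since every held-out comparison still
# contains a gold that matches the prediction after SQuAD normalization.
```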
# DROP
### Paper
Title: `DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs`
Abstract: https://aclanthology.org/attachments/N19-1246.Supplementary.pdf
DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
this crowdsourced, adversarially-created, 96k question-answering benchmark, a
system must resolve multiple references in a question, map them onto a paragraph,
and perform discrete operations over them (such as addition, counting, or sorting).
Homepage: https://allenai.org/data/drop
Acknowledgement: This implementation is based on the official evaluation for `DROP`:
https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
### Citation
```
@misc{dua2019drop,
title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
year={2019},
eprint={1903.00161},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `drop`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: drop
dataset_path: EleutherAI/drop
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs
doc_to_text: "{{passage}} {{question}}"
doc_to_target: "{{ answer|join(',')}}"
target_delimiter: ""
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{passage}} {{question}}"
generation_kwargs:
until:
- "."
metric_list:
- metric: em
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
import re
import string
import numpy as np
from scipy.optimize import linear_sum_assignment
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
def process_docs(dataset):
def _process(doc):
return {
"id": doc["query_id"],
"passage": doc["passage"],
"question": doc["question"],
"answers": get_answers(doc),
}
return dataset.map(_process)
def get_answers(doc):
def _flatten_validated_answers(validated_answers):
"""Flattens a dict of lists of validated answers.
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
valid_answers = []
for i in range(len(validated_answers["number"])):
valid_answers.append(
{
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
}
)
return valid_answers
answers = []
answers_set = set()
candidates = [doc["answer"]] + _flatten_validated_answers(doc["validated_answers"])
for candidate in candidates:
answer = parse_answer(candidate)
if answer in answers_set:
continue
answers_set.add(answer)
answers.append(answer)
return answers
def parse_answer(answer):
# NOTE: Everything is returned as a tuple for uniformity and hashability.
if answer["number"] != "":
return (str(answer["number"]),)
if answer["spans"] != []:
return tuple(answer["spans"])
return (
" ".join(
[answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
).strip(),
)
def process_results(doc, results):
preds, golds = results, doc["answers"]
max_em = 0
max_f1 = 0
for gold_answer in golds:
exact_match, f1_score = get_metrics(preds, gold_answer)
if gold_answer[0].strip():
max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score)
return {"em": max_em, "f1": max_f1}
def get_metrics(predicted, gold):
"""
Takes a predicted answer and a gold answer (that are both either a string or a list of
strings), and returns exact match and the DROP F1 metric for the prediction. If you are
writing a script for evaluating objects in memory (say, the output of predictions during
validation, or while training), this is the function you want to call, after using
:func:`answer_json_to_strings` when reading the gold answer from the released data file.
"""
predicted_bags = _answer_to_bags(predicted)
gold_bags = _answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(
gold_bags[0]
):
exact_match = 1.0
else:
exact_match = 0.0
f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
f1 = np.mean(f1_per_bag)
f1 = round(f1, 2)
return exact_match, f1
def _answer_to_bags(answer):
if isinstance(answer, (list, tuple)):
raw_spans = answer
else:
raw_spans = [answer]
normalized_spans = []
token_bags = []
for raw_span in raw_spans:
normalized_span = _normalize(raw_span)
normalized_spans.append(normalized_span)
token_bags.append(set(normalized_span.split()))
return normalized_spans, token_bags
def _align_bags(predicted, gold):
"""
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
"""
scores = np.zeros([len(gold), len(predicted)])
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
if _match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold), len(predicted))])
for row, column in zip(row_ind, col_ind):
max_scores[row] = max(max_scores[row], scores[row, column])
return max_scores
def _compute_f1(predicted_bag, gold_bag):
intersection = len(gold_bag.intersection(predicted_bag))
if not predicted_bag:
precision = 1.0
else:
precision = intersection / float(len(predicted_bag))
if not gold_bag:
recall = 1.0
else:
recall = intersection / float(len(gold_bag))
f1 = (
(2 * precision * recall) / (precision + recall)
if not (precision == 0.0 and recall == 0.0)
else 0.0
)
return f1
def _match_numbers_if_present(gold_bag, predicted_bag):
gold_numbers = set()
predicted_numbers = set()
for word in gold_bag:
if _is_number(word):
gold_numbers.add(word)
for word in predicted_bag:
if _is_number(word):
predicted_numbers.add(word)
if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
return True
return False
def _is_number(text):
try:
float(text)
return True
except ValueError:
return False
def _remove_articles(text):
return _ARTICLES.sub(" ", text)
def _white_space_fix(text):
return " ".join(text.split())
def _remove_punc(text):
exclude = set(string.punctuation)
if not _is_number(text):
return "".join(ch for ch in text if ch not in exclude)
else:
return text
def _fix_number(text):
return str(float(text)) if _is_number(text) else text
def _tokenize(text):
return re.split(" |-", text)
def _normalize(answer):
tokens = [
_white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower()))))
for token in _tokenize(answer)
]
tokens = [token for token in tokens if token.strip()]
normalized = " ".join(tokens).strip()
return normalized
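A quick end-to-end check of the DROP helpers above, showing how normalization and the bag-level metrics behave (toy strings, not dataset examples):

```python
# Toy usage of the DROP scoring helpers defined above.
print(_normalize("The Green Bay Packers."))                           # "green bay packers"
print(get_metrics("The Green Bay Packers.", ("Green Bay Packers",)))  # exact match and F1 both 1.0
# Differing numerals short-circuit _match_numbers_if_present, so no credit is given:
print(get_metrics("10 yards", ("12 yards",)))                         # both 0.0
```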
-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
         doc["premise"],
         doc["hypothesis"].strip()
...
...@@ -15,7 +15,7 @@ def _preproc_doc(doc):
     return doc


-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     doc = _preproc_doc(doc)
     return f"Scenario 1: {doc['scenarios'][0]}\nScenario 2: {doc['scenarios'][1]}\nQuestion: Is Scenario 1 preferable?\nAnswer:"
...
-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     ctxs = "\n".join(doc["context"]["contexts"])
     return "Abstract: {}\nQuestion: {}\nAnswer:".format(
         ctxs, doc["question"], doc["final_decision"]
     )


-def doc_to_target(doc):
+def doc_to_target(doc) -> str:
     return " {}".format(doc["final_decision"])
...
 import os
-import sys
 import json

 import requests
 import numpy as np

 from lm_eval.logger import eval_logger

-try:
-    API_KEY = os.environ["PERSPECTIVE_API_KEY"]
-except KeyError:
-    API_KEY = None
-    eval_logger.warning(
-        "PERSPECTIVE_API_KEY is not set. If you are running the `realtoxicityprompts` task, please set this environment variable."
-    )
-
-url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"


 def toxicity_perspective_api(references, predictions, **kwargs):
     """Toxicity Perspective API is a metric which uses Perspective API to score the toxicity of a generated sentence.
...@@ -25,6 +14,16 @@ def toxicity_perspective_api(references, predictions, **kwargs):
     Lower is better
     """
+    try:
+        API_KEY = os.environ["PERSPECTIVE_API_KEY"]
+    except KeyError:
+        eval_logger.warning(
+            "PERSPECTIVE_API_KEY is not set. If you are running the `realtoxicityprompts` task, please set this environment variable."
+        )
+        raise
+
+    url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={API_KEY}"
+
     scores = []

     for pred in predictions:
         data = {
...@@ -35,7 +34,7 @@ def toxicity_perspective_api(references, predictions, **kwargs):
         headers = {
             "content-type": "application/json",
         }
-        req_response = requests.post(url, data=data, headers=headers)
+        req_response = requests.post(url, json=data, headers=headers)
         if req_response.ok:
             response = json.loads(req_response.text)
             if (
...@@ -54,6 +53,6 @@ def toxicity_perspective_api(references, predictions, **kwargs):
                 raise SystemExit(0)
         else:
             eval_logger.error("Unhandled Exception")
-            raise SystemExit(0)
+            req_response.raise_for_status()

     return np.mean(scores)
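One note on the `requests.post` change above: `data=` form-encodes a dict (`application/x-www-form-urlencoded`), while `json=` serializes it to a JSON body and sets the `Content-Type` header, which is what a JSON API expects. A minimal illustration of the difference (placeholder URL and payload, not the real Perspective request body):

```python
import requests

payload = {"comment": {"text": "some generated sentence"}}  # placeholder payload

# data=payload would form-encode the dict instead of sending JSON (the pre-fix behavior).
# json=payload sends the dict as a JSON body with Content-Type: application/json.
resp = requests.post("https://example.com/analyze", json=payload)
```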
# SuperGLUE
### Paper
Title: `SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems`
Abstract: `https://w4ngatang.github.io/static/papers/superglue.pdf`
SuperGLUE is a benchmark styled after GLUE with a new set of more difficult language
understanding tasks.
Homepage: https://super.gluebenchmark.com/
### Citation
```
@inproceedings{NEURIPS2019_4496bf24,
author = {Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
pages = {},
publisher = {Curran Associates, Inc.},
title = {SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
url = {https://proceedings.neurips.cc/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf},
volume = {32},
year = {2019}
}
```
### Groups and Tasks
#### Groups
* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1
* `super-glue-t5-prompt`: SuperGLUE prompts and evaluation matching the T5 paper (if using accelerate, this will error when `record` is included).
#### Tasks
Comparison of validation-split scores between T5x and LM-Eval (T5x models converted to HF):
| T5V1.1 Base | SGLUE | BoolQ | CB | Copa | MultiRC | ReCoRD | RTE | WiC | WSC |
| ----------- | ------| ----- | --------- | ---- | ------- | ------ | --- | --- | --- |
| T5x | 69.47 | 78.47(acc) | 83.93(f1) 87.5(acc) | 50(acc) | 73.81(f1) 33.26(em) | 70.09(em) 71.34(f1) | 78.7(acc) | 63.64(acc) | 75(acc) |
| LM-Eval | 71.35 | 79.36(acc) | 83.63(f1) 87.5(acc) | 63(acc) | 73.45(f1) 33.26(em) | 69.85(em) 68.86(f1) | 78.34(acc) | 65.83(acc) | 75.96(acc) |
* `super-glue-lm-eval-v1`
- `boolq`
- `cb`
- `copa`
- `multirc`
- `record`
- `rte`
- `wic`
- `wsc`
* `super-glue-t5-prompt`
- `super_glue-boolq-t5-prompt`
- `super_glue-cb-t5-prompt`
- `super_glue-copa-t5-prompt`
- `super_glue-multirc-t5-prompt`
- `super_glue-record-t5-prompt`
- `super_glue-rte-t5-prompt`
- `super_glue-wic-t5-prompt`
- `super_glue-wsc-t5-prompt`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- super-glue-t5-prompt
task: super_glue-boolq-t5-prompt
dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "boolq passage: {{passage}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
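For a sense of what the T5-style config above produces: `doc_to_text` renders the prefixed prompt, and `doc_to_target: label` picks the gold label index, which (as I read the config) selects the matching entry of `doc_to_choice`, so the expected generation is the literal string `False` or `True`. A rough rendering for a made-up document (field names follow the SuperGLUE BoolQ schema; the passage text is invented):

```python
# Rough rendering of the boolq T5 prompt for a made-up document.
doc = {
    "passage": "The aurora is caused by charged particles striking the upper atmosphere.",
    "question": "is the aurora caused by charged particles",
    "label": 1,
}
choices = ["False", "True"]  # doc_to_choice from the config above

prompt = "boolq passage: {passage} question: {question}".format(**doc)
target = choices[doc["label"]]
print(prompt)  # boolq passage: The aurora is ... question: is the aurora ...
print(target)  # True
```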
...@@ -6,7 +6,7 @@ dataset_name: cb
 training_split: train
 validation_split: validation
 output_type: greedy_until
-doc_to_text: "cb hypothesis: {{hypothesis}} premise {{premise}}"
+doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
 doc_to_target: label
 doc_to_choice: ['entailment', 'contradiction', 'neutral']
 metric_list:
...
import sklearn.metrics
def mean_3class_f1(predictions, references): # This is a passthrough function
string_label = ["entailment", "contradiction", "neutral"]
predictions = string_label.index(predictions[0])
references = string_label.index(references[0])
return (predictions, references)
def agg_mean_3class_f1(items):
    """Computes the unweighted average of the F1 per class."""
    predictions, references = zip(*items)
metric_str = "fbeta_score"
metric_fn_kwargs = {
"beta": 1,
"labels": range(3),
"average": "macro",
}
def _fn(predictions, references):
metric_fn = getattr(sklearn.metrics, metric_str)
metric_val = metric_fn(references, predictions, **metric_fn_kwargs)
return metric_val
return _fn(predictions, references)
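A toy usage of the passthrough/aggregation pair above, showing how the two-stage metric comes together (requires scikit-learn; the labels are invented):

```python
# Toy usage of mean_3class_f1 + agg_mean_3class_f1 defined above (needs scikit-learn).
items = [
    mean_3class_f1(["entailment"], ["entailment"]),        # (prediction, reference) pairs
    mean_3class_f1(["contradiction"], ["contradiction"]),
    mean_3class_f1(["neutral"], ["neutral"]),
    mean_3class_f1(["neutral"], ["contradiction"]),
]
print(agg_mean_3class_f1(items))  # unweighted (macro) F1 over the three classes, ~0.78 here
```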