in-place replace main with lm-eval2, keeping old git history

d2a9b759 · haileyschoelkopf · 814940e8 · d2a9b759 · d2a9b759 · 814940e8
Commit d2a9b759 authored Apr 19, 2023 by haileyschoelkopf
20 changed files
--- a/lm_eval/prompts/__init__.py
+++ b/lm_eval/prompts/__init__.py
+# TODO: decide whether we want jinja2 or f-string prompts. would it be cursed to support both?
+# Prompt library. 
+# Stores prompts in a dictionary indexed by 2 levels:
+# prompt category name, and prompt name.
+# This allows us to access prompts by a single `category:name` identifier (see `get_prompt`).
+# Maps prompt category name -> prompt name -> prompt template string.
+# NOTE(review): the two "qa-basic" templates use different placeholder syntax
+# (`{{question}}` vs. `{question}`); confirm which renderer each one is meant
+# for before relying on both.
+PROMPT_REGISTRY = {
+    "qa-basic": {
+        "question-newline-answer": "Question: {{question}}\nAnswer:",
+        "q-newline-a": "Q: {question}\nA:"
+    },
+}
+
+def get_prompt(prompt_id: str):
+    """Look up a prompt template in `PROMPT_REGISTRY`.
+
+    :param prompt_id: str
+        Identifier of the form `<category name>:<prompt name>`,
+        e.g. `qa-basic:question-newline-answer`.
+    :returns:
+        The entry stored in `PROMPT_REGISTRY` under that category and name.
+    :raises ValueError:
+        If `prompt_id` does not contain exactly one `:` separator.
+    :raises KeyError:
+        If the category or the prompt name is not present in the registry.
+    """
+    # unpack prompt name; the tuple unpacking raises ValueError unless the
+    # split yields exactly two parts, so only that exception is translated
+    try:
+        category_name, prompt_name = prompt_id.split(":")
+    except ValueError as err:
+        raise ValueError(
+            "expected only a single `:` as separator between "
+            f"prompt category and name, but got `{prompt_id}` instead"
+        ) from err
+    return PROMPT_REGISTRY[category_name][prompt_name]
\ No newline at end of file
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
--- a/lm_eval/tasks/anli.py
+++ b/lm_eval/tasks/anli.py
-"""
-Adversarial NLI: A New Benchmark for Natural Language Understanding
-https://arxiv.org/pdf/1910.14599.pdf
-
-Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
-human-and-model-in-the-loop procedure. It consists of three rounds that progressively
-increase in difficulty and complexity, and each question-answer includes annotator-
-provided explanations.
-
-Homepage: "https://github.com/facebookresearch/anli"
-"""
-import numpy as np
-from lm_eval.base import rf, Task
-from lm_eval.metrics import mean
-
-
-_CITATION = """
-@inproceedings{nie-etal-2020-adversarial,
-    title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
-    author = "Nie, Yixin  and
-      Williams, Adina  and
-      Dinan, Emily  and
-      Bansal, Mohit  and
-      Weston, Jason  and
-      Kiela, Douwe",
-    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
-    year = "2020",
-    publisher = "Association for Computational Linguistics",
-}
-"""
-
-
-class ANLIBase(Task):
-    VERSION = 0
-    DATASET_PATH = "anli"
-    DATASET_NAME = None
-    SPLIT = None
-
-    def has_training_docs(self):
-        return True
-
-    def has_validation_docs(self):
-        return True
-
-    def has_test_docs(self):
-        return True
-
-    def training_docs(self):
-        if self.has_training_docs():
-            if self._training_docs is None:
-                self._training_docs = list(self.dataset["train_r" + str(self.SPLIT)])
-            return self._training_docs
-
-    def validation_docs(self):
-        if self.has_validation_docs():
-            return self.dataset["dev_r" + str(self.SPLIT)]
-
-    def test_docs(self):
-        if self.has_test_docs():
-            return self.dataset["test_r" + str(self.SPLIT)]
-
-    def doc_to_text(self, doc):
-        # OA does this a bit weirdly: they prepend "anli 1:  anli 1:  " to the beginning
-        # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
-        # appended onto the question, with no "Answer:" or even a newline. Do we *really*
-        # want to do it exactly as OA did?
-        return (
-            doc["premise"]
-            + "\nQuestion: "
-            + doc["hypothesis"]
-            + " True, False, or Neither?\nAnswer:"
-        )
-
-    def should_decontaminate(self):
-        return True
-
-    def doc_to_decontamination_query(self, doc):
-        return doc["premise"]
-
-    def doc_to_target(self, doc):
-        # True = entailment
-        # False = contradiction
-        # Neither = neutral
-        return " " + ["True", "Neither", "False"][doc["label"]]
-
-    def construct_requests(self, doc, ctx):
-        """Uses RequestFactory to construct Requests and returns an iterable of
-        Requests which will be sent to the LM.
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param ctx: str
-            The context string, generated by fewshot_context. This includes the natural
-            language description, as well as the few shot examples, and the question
-            part of the document for `doc`.
-        """
-        ll_true, _ = rf.loglikelihood(ctx, " True")
-        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
-        ll_false, _ = rf.loglikelihood(ctx, " False")
-        return ll_true, ll_neither, ll_false
-
-    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
-        dict where keys are the names of submetrics and values are the values of
-        the metric for that one document
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param results:
-            The results of the requests created in construct_requests.
-        """
-        gold = doc["label"]
-        pred = np.argmax(results)
-        return {"acc": pred == gold}
-
-    def aggregation(self):
-        """
-        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metrics
-        """
-        return {"acc": mean}
-
-    def higher_is_better(self):
-        """
-        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
-            whether a higher value of the submetric is better
-        """
-        return {"acc": True}
-
-
-class ANLIRound1(ANLIBase):
-    SPLIT = 1
-
-
-class ANLIRound2(ANLIBase):
-    SPLIT = 2
-
-
-class ANLIRound3(ANLIBase):
-    SPLIT = 3
--- a/lm_eval/tasks/arc.py
+++ b/lm_eval/tasks/arc.py
@@ -12,7 +12,10 @@ a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questi

 Homepage: https://allenai.org/data/arc
 """
-from lm_eval.base import MultipleChoiceTask
+from lm_eval.api.task import MultipleChoiceTask
+from lm_eval.prompts import get_prompt
+
+from lm_eval import utils


 _CITATION = """
@@ -27,10 +30,12 @@ _CITATION = """


 class ARCEasy(MultipleChoiceTask):
-    VERSION = 0
+    VERSION = "2.0"
    DATASET_PATH = "ai2_arc"
    DATASET_NAME = "ARC-Easy"

+    OUTPUT_TYPE = "loglikelihood"
+
    def has_training_docs(self):
        return True

@@ -58,14 +63,15 @@ class ARCEasy(MultipleChoiceTask):
        doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
        out_doc = {
            "id": doc["id"],
-            "query": "Question: " + doc["question"] + "\nAnswer:",
+            "question": doc["question"],
            "choices": doc["choices"]["text"],
            "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
        }
        return out_doc

    def doc_to_text(self, doc):
-        return doc["query"]
+        prompt_template = get_prompt("qa-basic:question-newline-answer")
+        return utils.apply_template(prompt_template, doc)

    def should_decontaminate(self):
        return True

--- a/lm_eval/tasks/arc.yaml
+++ b/lm_eval/tasks/arc.yaml
+# Task configuration for the ARC-Challenge subset of `ai2_arc`.
+dataset_path: ai2_arc
+dataset_name: ARC-Challenge
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: "Q: {{question}}\nA:"
+# NOTE(review): if this is rendered with a stock Jinja2 environment, `int` is
+# available as a filter (`value | int`) rather than as a callable global, so
+# `int(...)` would need to be registered by the renderer — confirm.
+# Also assumes `answerKey` is a 1-based numeric string; verify the behavior
+# for documents whose key is a letter.
+doc_to_target: "{% set answer_choices = doc['choices']['text'] %}{{answer_choices[int(doc['answerKey']) - 1]}}"
+metric_list: [
+  [exact_match, mean, true]
+  ]
--- a/lm_eval/tasks/arithmetic.py
+++ b/lm_eval/tasks/arithmetic.py
--- a/lm_eval/tasks/asdiv.py
+++ b/lm_eval/tasks/asdiv.py
--- a/lm_eval/tasks/blimp.py
+++ b/lm_eval/tasks/blimp.py
--- a/lm_eval/tasks/cbt.py
+++ b/lm_eval/tasks/cbt.py
--- a/lm_eval/tasks/coqa.py
+++ b/lm_eval/tasks/coqa.py
--- a/lm_eval/tasks/crowspairs.py
+++ b/lm_eval/tasks/crowspairs.py
--- a/lm_eval/tasks/drop.py
+++ b/lm_eval/tasks/drop.py
--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
--- a/lm_eval/tasks/gsm8k.py
+++ b/lm_eval/tasks/gsm8k.py
--- a/lm_eval/tasks/gsm8k.yaml
+++ b/lm_eval/tasks/gsm8k.yaml
--- a/lm_eval/tasks/headqa.py
+++ b/lm_eval/tasks/headqa.py
--- a/lm_eval/tasks/hellaswag.py
+++ b/lm_eval/tasks/hellaswag.py
--- a/lm_eval/tasks/hendrycks_ethics.py
+++ b/lm_eval/tasks/hendrycks_ethics.py
--- a/lm_eval/tasks/hendrycks_math.py
+++ b/lm_eval/tasks/hendrycks_math.py
--- a/lm_eval/tasks/hendrycks_test.py
+++ b/lm_eval/tasks/hendrycks_test.py