gaoqiong / lm-evaluation-harness / Commits / 8ac99269

Commit 8ac99269, authored Oct 30, 2021 by Jonathan Tow

Replace the `fewshot_description` API with a `description_dict` based interface

Parent: b67aec37
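In practice the new interface is driven by a JSON file that maps task names to description strings and is handed to `simple_evaluate` through the new `description_path` argument (see lm_eval/evaluator.py below). A minimal sketch of that flow, assuming a GPT-2 model; the file path, few-shot count, and model arguments are illustrative, and the description string is the one removed from coqa.py in this commit:

import json

import lm_eval.evaluator

# Per-task descriptions, keyed by registered task name.
descriptions = {
    "coqa": "Given a passage and a conversation so far, answer the next "
            "question in the conversation.",
}
with open("descriptions.json", "w") as f:
    json.dump(descriptions, f)

results = lm_eval.evaluator.simple_evaluate(
    model="gpt2",                            # illustrative model name
    model_args="",                           # illustrative (empty) model arguments
    task_names=["coqa"],
    description_path="descriptions.json",    # new in this commit
    num_fewshot=2,
)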
Changes: 48 files in total. Showing 20 changed files with 27 additions and 101 deletions (+27 / -101).
lm_eval/base.py                    +10  -8
lm_eval/evaluator.py               +16  -10
lm_eval/tasks/anli.py               +0  -4
lm_eval/tasks/arc.py                +0  -4
lm_eval/tasks/cbt.py                +0  -4
lm_eval/tasks/coqa.py               +1  -4
lm_eval/tasks/drop.py               +0  -4
lm_eval/tasks/glue.py               +0  -17
lm_eval/tasks/headqa.py             +0  -4
lm_eval/tasks/hellaswag.py          +0  -5
lm_eval/tasks/hendrycks_math.py     +0  -3
lm_eval/tasks/lambada.py            +0  -4
lm_eval/tasks/lambada_cloze.py      +0  -3
lm_eval/tasks/logiqa.py             +0  -4
lm_eval/tasks/mathqa.py             +0  -4
lm_eval/tasks/mc_taco.py            +0  -3
lm_eval/tasks/mutual.py             +0  -4
lm_eval/tasks/naturalqs.py          +0  -4
lm_eval/tasks/openbookqa.py         +0  -4
lm_eval/tasks/piqa.py               +0  -4
lm_eval/base.py  (View file @ 8ac99269)

@@ -2,6 +2,7 @@ import abc
 import random
 import numpy as np
 import re
 
 from lm_eval import tasks
 from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
@@ -224,11 +225,15 @@ class Task(abc.ABC):
         pass
 
     def fewshot_description(self):
+        import warnings
+        warnings.warn(
+            "`fewshot_description` will be removed in coming versions. Pass " \
+            "any custom descriptions to the `evaluate` function instead.",
+            DeprecationWarning)
         return ""
 
-    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
-        raw_description = self.fewshot_description()
-        description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
+    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+        description = description + "\n\n" if description else ""
 
         if num_fewshot == 0:
             labeled_examples = ""
@@ -295,16 +300,13 @@ class PerplexityTask(Task, abc.ABC):
     def has_training_docs(self):
         return False
 
-    def fewshot_description(self):
-        return ""
-
     def fewshot_examples(self, k, rnd):
         assert k == 0
         return []
 
-    def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
+    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
         assert num_fewshot == 0
-        assert not provide_description
+        assert description is None
         return ""
 
     def higher_is_better(self):
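The diff above changes the prompt-prefix contract of `Task.fewshot_context`: the old path combined the task's own `fewshot_description()` with a "===" separator when `provide_description` was set, while the new path simply prepends a caller-supplied `description` followed by a blank line. A small self-contained illustration (not library code; the sample description string is made up):

def old_prefix(raw_description, provide_description):
    # Pre-commit behaviour: the description came from the task itself and was
    # joined with a "===" separator, gated by the provide_description flag.
    return (raw_description + "\n===\n\n") if provide_description and raw_description else ""

def new_prefix(description=None):
    # Post-commit behaviour: the description is supplied by the caller (via
    # description_dict in the evaluator) and is followed by a blank line.
    return description + "\n\n" if description else ""

assert old_prefix("Answer the question.", True) == "Answer the question.\n===\n\n"
assert new_prefix("Answer the question.") == "Answer the question.\n\n"
assert new_prefix() == ""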
lm_eval/evaluator.py  (View file @ 8ac99269)

 import collections
 import itertools
+import json
 import random
 
 import lm_eval.metrics
 import lm_eval.models
@@ -7,7 +8,7 @@ import lm_eval.tasks
 import lm_eval.base
 
 import numpy as np
 
-def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
+def simple_evaluate(model, model_args, task_names, description_path=None, num_fewshot=0, batch_size=None, device=None, no_cache=False, limit=None, bootstrap_iters=100000):
     random.seed(1234)
     np.random.seed(1234)
@@ -19,7 +20,12 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
         lm = lm_eval.base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
 
     task_dict = lm_eval.tasks.get_task_dict(task_names)
-    results = evaluate(lm, task_dict, False, num_fewshot, limit)
+
+    description_dict = {}
+    if description_path:
+        with open(description_path, 'r') as f:
+            description_dict = json.load(f)
+
+    results = evaluate(lm, task_dict, num_fewshot, limit, description_dict)
 
     # add info about the model and few shot config
     results["config"] = {
@@ -28,6 +34,8 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
         "num_fewshot": num_fewshot,
         "batch_size": batch_size,
         "device": device,
+        # TODO (jon-tow): Should we add the description info to `results["config"]`?
+        # "description_dict": description_dict,
         "no_cache": no_cache,
         "limit": limit,
         "bootstrap_iters": bootstrap_iters
@@ -36,9 +44,7 @@ def simple_evaluate(model, model_args, task_names, num_fewshot=0, batch_size=Non
     return results
 
 
-def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
-    assert not provide_description  # not implemented. todo: implement proper description-providing system
-
+def evaluate(lm, task_dict, num_fewshot, limit, description_dict=None, bootstrap_iters=100000):
     # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
     task_dict_items = [(name, task) for name, task in task_dict.items() if (task.has_validation_docs() or task.has_test_docs())]
@@ -73,16 +79,16 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
         rnd.seed(42)
         rnd.shuffle(task_docs)
 
+        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
+
         for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
             docs[(task_name, doc_id)] = doc
             ctx = task.fewshot_context(
                 doc=doc,
-                provide_description=provide_description,
                 num_fewshot=num_fewshot,
-                rnd=rnd
+                rnd=rnd,
+                description=description
             )
             reqs = task.construct_requests(doc, ctx)
             if not isinstance(reqs, (list, tuple)):
                 reqs = [reqs]
             for i, req in enumerate(reqs):
@@ -168,4 +174,4 @@ def make_table(result_dict):
     # todo: make latex table look good
     # print(latex_writer.dumps())
 
-    return md_writer.dumps()
\ No newline at end of file
+    return md_writer.dumps()
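Note the per-task lookup added to `evaluate` above: any task missing from `description_dict` (or a `None`/empty dict) silently falls back to an empty description rather than raising. A short runnable sketch of that fallback, using "coqa" and "lambada" as example task names and the coqa string removed in this commit:

description_dict = {"coqa": "Given a passage and a conversation so far, answer "
                            "the next question in the conversation."}

for task_name in ["coqa", "lambada"]:
    # Same expression as in lm_eval/evaluator.py above.
    description = description_dict[task_name] if description_dict and task_name in description_dict else ""
    print(task_name, "->", repr(description))
# "coqa" gets its description; "lambada" falls back to "".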
lm_eval/tasks/anli.py  (View file @ 8ac99269)

@@ -33,10 +33,6 @@ class ANLIBase(HFTask):
         if self.has_test_docs():
             return self.data["test_r" + str(self.SPLIT)]
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         # OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
         # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
lm_eval/tasks/arc.py  (View file @ 8ac99269)

@@ -29,10 +29,6 @@ class ARCEasy(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/cbt.py  (View file @ 8ac99269)

@@ -17,10 +17,6 @@ class CBTBase(HFTask):
     VERSION = 0
 
-    def fewshot_description(self):
-        # TODO: Figure out description.
-        return ""
-
     def detokenize(self, text):
         text = text.replace(" '", "'")
         text = text.replace("\n ", "\n")
lm_eval/tasks/coqa.py  (View file @ 8ac99269)

@@ -36,10 +36,7 @@ class CoQA(Task):
     def test_docs(self):
         pass
 
-    def fewshot_description(self):
-        return "Given a passage and a conversation so far, answer the next question in the conversation."
-
     def doc_to_text(self, doc):
         # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
         # and a question qi, the task is to predict the answer ai
lm_eval/tasks/drop.py  (View file @ 8ac99269)

@@ -40,10 +40,6 @@ class DROP(Task):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def _load_docs(self, docs):
         for doc in docs:
             for qa in doc["qa_pairs"]:
lm_eval/tasks/glue.py  (View file @ 8ac99269)

@@ -21,10 +21,6 @@ class CoLA(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO
-        return ""
-
     def doc_to_text(self, doc):
         return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
@@ -69,9 +65,6 @@ class SST(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        return "Indicate if the sentiment of each sentence is positive or negative."
-
     def doc_to_text(self, doc):
         return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
             general_detokenize(doc["sentence"]),
@@ -342,9 +335,6 @@ class MRPC(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        return "Indicate if both sentences mean the same thing."
-
     def doc_to_text(self, doc):
         return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
             general_detokenize(doc["sentence1"]),
@@ -395,9 +385,6 @@ class QQP(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        return "Indicate if both questions ask the same thing."
-
     def doc_to_text(self, doc):
         return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
             doc["question1"],
@@ -448,10 +435,6 @@ class STSB(HFTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_description(self):
-        return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
-            "where 5 means identical and 0 means unrelated."
-
     def doc_to_text(self, doc):
         return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
             doc["sentence1"],
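The hard-coded GLUE prompts deleted above are not lost: the same strings can now be supplied per task at call time. A sketch, assuming the registry names "sst", "mrpc", and "qqp" and a model instance `lm` loaded elsewhere (e.g. via lm_eval.models); this uses the in-memory `description_dict` argument of `evaluate` rather than the JSON file path accepted by `simple_evaluate`:

import lm_eval.evaluator
import lm_eval.tasks

# Strings copied from the fewshot_description() methods removed above;
# the registry names are assumed.
GLUE_DESCRIPTIONS = {
    "sst": "Indicate if the sentiment of each sentence is positive or negative.",
    "mrpc": "Indicate if both sentences mean the same thing.",
    "qqp": "Indicate if both questions ask the same thing.",
}

def evaluate_glue_with_descriptions(lm, num_fewshot=0, limit=None):
    # `lm` is a loaded model instance (see lm_eval.models); description_dict is
    # the in-memory counterpart of the description_path JSON file.
    task_dict = lm_eval.tasks.get_task_dict(list(GLUE_DESCRIPTIONS))
    return lm_eval.evaluator.evaluate(
        lm, task_dict, num_fewshot, limit, description_dict=GLUE_DESCRIPTIONS
    )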
lm_eval/tasks/headqa.py  (View file @ 8ac99269)

@@ -25,9 +25,5 @@ class HeadQA(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/hellaswag.py  (View file @ 8ac99269)

@@ -35,10 +35,5 @@ class HellaSwag(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        return "Label for the relevant action: Sentences describing the " \
-            "context, with an incomplete sentence trailing\nanswer that " \
-            "plausibly completes the situation."
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/hendrycks_math.py  (View file @ 8ac99269)

@@ -55,9 +55,6 @@ class Math(Task):
     def test_docs(self):
         return self._load_docs(self.DATASET_PATH / "test" / self.get_file_info())
 
-    def fewshot_description(self):
-        return "Given a mathematics problem, determine the answer. Simplify your answer as much as possible."
-
     def doc_to_text(self, doc):
         return "Problem: " + doc["problem"] + "\nAnswer:"
lm_eval/tasks/lambada.py  (View file @ 8ac99269)

@@ -47,10 +47,6 @@ class LAMBADA(Task):
     def doc_to_target(self, doc):
         return " " + doc['text'].rsplit(' ', 1)[1]
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def construct_requests(self, doc, ctx):
         ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
lm_eval/tasks/lambada_cloze.py  (View file @ 8ac99269)

@@ -13,6 +13,3 @@ class LAMBADA_cloze(LAMBADA):
     def doc_to_target(self, doc):
         return " " + doc['text'].rsplit(' ', 1)[1]
 
-    def fewshot_description(self):
-        return "Fill in blank:\n"
-
lm_eval/tasks/logiqa.py  (View file @ 8ac99269)

@@ -80,9 +80,5 @@ class LogiQA(MultipleChoiceTask):
     def test_docs(self):
         return self._load_docs(self.DATASET_PATH / "Test.txt")
 
-    def fewshot_description(self):
-        # TODO: figure out actual description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/mathqa.py  (View file @ 8ac99269)

@@ -29,9 +29,5 @@ class MathQA(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/mc_taco.py  (View file @ 8ac99269)

@@ -39,9 +39,6 @@ class MCTACO(HFTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_description(self):
-        return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
-
     def doc_to_text(self, doc):
         return f"{doc['sentence']}\nQuestion: {doc['question']}\n" \
             f"Answer: {doc['answer']}\nPlausible:"
lm_eval/tasks/mutual.py  (View file @ 8ac99269)

@@ -70,10 +70,6 @@ class MuTualBase(Task):
     def test_docs(self):
         return NotImplemented
 
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
-
     def doc_to_text(self, doc):
         return self.detokenize(doc["article"])
lm_eval/tasks/naturalqs.py  (View file @ 8ac99269)

@@ -21,10 +21,6 @@ class NaturalQs(HFTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
     def training_docs(self):
         # Cache training for faster few-shot.
         # Data is too large to fit in memory.
lm_eval/tasks/openbookqa.py  (View file @ 8ac99269)

@@ -25,9 +25,5 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
         }
         return out_doc
 
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
-
     def doc_to_text(self, doc):
         return doc["query"]
lm_eval/tasks/piqa.py  (View file @ 8ac99269)

@@ -18,10 +18,6 @@ class PiQA(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return False
 
-    def fewshot_description(self):
-        # TODO: figure out fewshot description
-        return ""
-
     def _convert_standard(self, doc):
         out_doc = {
             "goal": doc["goal"],