"vscode:/vscode.git/clone" did not exist on "2ba6da8abc68ea073377407be8fa93826409b9d6"
Commit ee53be21 authored by Jonathan Tow

Add `provide_description` arg for backward compat

parent 7a357971
@@ -11,7 +11,8 @@ import numpy as np
 def simple_evaluate(model, model_args, task_names,
                     num_fewshot=0, batch_size=None, device=None,
-                    no_cache=False, limit=None, bootstrap_iters=100000):
+                    no_cache=False, limit=None, bootstrap_iters=100000,
+                    description_dict_path=None):
     """Instantiate and evaluate a model on a list of tasks.
 
     :param model: str
@@ -32,6 +33,8 @@ def simple_evaluate(model, model_args, task_names,
         Limit the number of examples per task (only use this for testing)
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
+    :param description_dict_path:
+        Path to a JSON file containing `task_name: description` key-values for custom prompts
     :return
         Dictionary of results
     """
@@ -71,7 +74,7 @@ def simple_evaluate(model, model_args, task_names,
     return results
 
 
-def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
+def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000, description_dict=None):
     """Instantiate and evaluate a model on a list of tasks.
 
     :param lm: obj
@@ -86,6 +89,8 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
         Limit the number of examples per task (only use this for testing)
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
+    :param description_dict:
+        Dictionary of task descriptions of the form: `task_name: description`
     :return
         Dictionary of results
     """
...
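Taken together, the two entry points now expose custom prompts in two forms: `simple_evaluate` accepts a path to a JSON file via `description_dict_path`, while `evaluate` accepts the already-loaded mapping via `description_dict` alongside the restored positional `provide_description` flag. A minimal sketch of a direct call under the new `evaluate` signature, reusing the `dummy` model and `copa` task that appear elsewhere in this commit; the JSON path is a placeholder:

```python
import json

import lm_eval.models
import lm_eval.tasks
from lm_eval import evaluator

# Load the per-task prompt mapping that `simple_evaluate` would otherwise
# read for you from `description_dict_path`.
with open("/your/path/descriptions.json") as f:  # placeholder path
    description_dict = json.load(f)

lm = lm_eval.models.get_model("dummy")()          # dummy LM, as in the tests below
task_dict = {"copa": lm_eval.tasks.get_task("copa")()}

results = evaluator.evaluate(
    lm,
    task_dict,
    False,   # provide_description, kept for backward compatibility
    0,       # num_fewshot
    None,    # limit
    description_dict=description_dict,
)
```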
@@ -237,9 +237,6 @@ class EthicsUtilitarianismOriginal(Ethics):
         for doc in docs:
             yield {"activity": doc[0], "baseline": doc[1], "rating": ""}
 
-    def fewshot_description(self):
-        return "Rate how pleasant each of the following activities is on a scale from 1 (very unpleasant) to 10 (very pleasant).\n\n"
-
     def fewshot_examples(self, k, rnd):
         # Overwriting fewshot examples as k can be max 5
         assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more."
@@ -350,9 +347,6 @@ class EthicsVirtue(Ethics):
     def get_prefix(self):
         return "virtue/virtue"
 
-    def fewshot_description(self):
-        return "The following is a list of sentences and traits, along with whether the trait is exhibited in that sentence.\n\n"
-
     def process_doc(self, doc):
         # Append identifiers before shuffling to calculate exact matches lateron & skip the first element of headers
         return [x + [i] for i, x in enumerate(doc[1:])]
...
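The hard-coded `fewshot_description` prompts deleted here (and in the multiple-choice task in the next hunk) are the kind of text that is now meant to be supplied externally through `description_dict`. A sketch of the equivalent mapping; the registry keys are assumed, since the task names are not shown anywhere in this commit:

```python
# Hypothetical replacement for the removed hard-coded prompts; the keys
# ("ethics_utilitarianism_original", "ethics_virtue") are assumed registry
# names, not taken from this diff.
description_dict = {
    "ethics_utilitarianism_original": (
        "Rate how pleasant each of the following activities is on a scale "
        "from 1 (very unpleasant) to 10 (very pleasant)."
    ),
    "ethics_virtue": (
        "The following is a list of sentences and traits, along with whether "
        "the trait is exhibited in that sentence."
    ),
}
```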
@@ -114,9 +114,5 @@ class GeneralHendrycksTest(MultipleChoiceTask):
         return rnd.sample(list(self._fewshot_docs), k)
 
-    def fewshot_description(self):
-        subject = self.subject.replace("_", " ")
-        return f"The following are multiple choice questions (with answers) about {subject}."
-
     def doc_to_text(self, doc):
         return doc["query"]
@@ -36,9 +36,9 @@ class PROST(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd, description=None):
         assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
-        return super().fewshot_context(doc, num_fewshot, rnd, description)
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd, description)
 
     def _convert_standard(self, doc):
         out_doc = {
...
@@ -85,9 +85,9 @@ class TruthfulQAMultipleChoice(Task):
     def doc_to_target(self, doc):
         return " "
 
-    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd, description=None):
         assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
-        return super().fewshot_context(doc, num_fewshot, rnd, description)
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd, description)
 
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
@@ -217,9 +217,9 @@ class TruthfulQAGeneration(Task):
     def doc_to_target(self, doc):
         return " "
 
-    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd, description=None):
         assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
-        return super().fewshot_context(doc, num_fewshot, rnd, description)
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd, description)
 
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
...
@@ -51,7 +51,7 @@ def main():
     values = []
     for taskname in task_list.split(","):
         lm.tokencost = 0
-        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, 0, None, bootstrap_iters=10)
+        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, False, 0, None, bootstrap_iters=10)
         print(taskname, lm.tokencost)
         values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.0008, lm.tokencost / 1000 * 0.0012, lm.tokencost / 1000 * 0.006, lm.tokencost / 1000 * 0.06])
...
@@ -144,7 +144,7 @@ Understand that the strings from `doc_to_text` and `doc_to_target` will be conca
 ### Formatting Prompts
 
-If you'd like to prepend your few-shot examples with a natural language description or provide a lone custom prompt under a zero-shot setting, you can do this on a per-task basis via the `description_dict` arg of `evaluator.evaluate` which is accessible through the `evaluator` module. This `description_dict` must adhere to the following key-value structure:
+If you'd like to prepend your few-shot examples with a natural language description or provide a lone custom prompt for a zero-shot task, you can do so on a per-task basis via the `description_dict` arg of `evaluator.evaluate` which is accessible from the `evaluator` module. This `description_dict` must adhere to the following key-value structure:
 
 - **key**: the task name as specified in the lm-eval-harness task registry (see the following section on task registry).
 - **value**: the corresponding description/prompt for the task identified by **key**.
@@ -153,13 +153,13 @@ E.g.
 ```python
 description_dict = {
-    "task_name_1": "task_name_1 custom prompt or few-shot task description",
-    "task_name_2": "task_name_2 custom prompt or few-shot task description",
+    "task_name_1": "fewshot description",
+    "task_name_2": "fewshot description",
     ...
 }
 ```
 
-At a higher level, one can interface with `evaluator.evaluate` by simply passing a JSON file path to the `description_path` arg of the command-line interface program, `main.py`. The JSON file pointed to should be structured the same way as the aforementioned `description_dict`. E.g. for some file at `/your/path/descriptions.json` you might have:
+One can also interface with `evaluator.evaluate` from a higher level by simply passing a JSON file path to the `description_dict_path` arg of the command-line interface program, `main.py`. The JSON file pointed to should be structured the same way as the aforementioned `description_dict`. E.g. for some file at `/your/path/descriptions.json` you might have:
 
 ```json
 {
@@ -173,7 +173,7 @@ which can then be hooked up to the evaluator through the `main.py` CLI as:
 ```python
 python main.py \
     --tasks cycle_letters,copa \
-    --description_path /your/path/descriptions.json \
+    --description_dict_path /your/path/descriptions.json \
     ...
 ```
@@ -187,11 +187,12 @@ After registering your task, you can now check on your data downloading and veri
 ```bash
 python -m scripts.write_out \
-    --tasks <your-task> \
     --output_base_path <path> \
+    --tasks <your-task> \
     --sets <train | val | test> \
     --num_fewshot K \
-    --num_examples N
+    --num_examples N \
+    --description_dict_path <path>
 ```
 
 Open the file specified at the `--output_base_path <path>` and ensure it passes
...
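As a concrete illustration of the file-based route described in the guide above, a short hypothetical script that writes such a `descriptions.json`; the prompt strings and output path are placeholders, and the task names simply reuse the `--tasks cycle_letters,copa` example:

```python
import json

# Hypothetical per-task prompts, keyed by registry task name as the guide requires.
descriptions = {
    "cycle_letters": "Please unscramble the letters into a word, and write that word:",
    "copa": "Given a premise, pick the more plausible of the two alternatives:",
}

with open("/your/path/descriptions.json", "w") as f:  # placeholder path
    json.dump(descriptions, f, indent=4)
```

The resulting file can then be handed to `main.py` via `--description_dict_path`, or to `simple_evaluate` via its new `description_dict_path` argument.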
@@ -5,15 +5,13 @@ import lm_eval.models
 from lm_eval.evaluator import evaluate
 
 
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--description_path', default=None)
-    parser.add_argument('--num_fewshot', type=int, default=0)
-    parser.add_argument('--limit', type=int, default=None)
-    return parser.parse_args()
-
-
-def main():
+def test_cli_description_dict_path():
+    def parse_args():
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--description_dict_path', default=None)
+        parser.add_argument('--num_fewshot', type=int, default=0)
+        parser.add_argument('--limit', type=int, default=None)
+        return parser.parse_args()
+
     args = parse_args()
     task_names = ['hellaswag', 'copa']
@@ -21,14 +19,15 @@ def main():
     lm = lm_eval.models.get_model('dummy')()
 
     description_dict = {}
-    if args.description_path:
-        with open(args.description_path, 'r') as f:
+    if args.description_dict_path:
+        with open(args.description_dict_path, 'r') as f:
             description_dict = json.load(f)
 
     num_fewshot = args.num_fewshot
 
     results = evaluate(
         lm,
         task_dict,
+        False,
         num_fewshot,
         args.limit,
         description_dict
@@ -36,4 +35,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    test_cli_description_dict_path()
@@ -48,8 +48,8 @@ def test_evaluator(taskname, task_class):
     lm.loglikelihood_rolling = ll_perp_fn
 
     limit = 10
-    e1 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
-    e2 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
+    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10, description_dict=None)
+    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10, description_dict=None)
 
     # check that caching is working
     assert e1 == e2
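The change at these call sites summarizes the backward-compat intent of the commit: `provide_description` is threaded back through explicitly, so callers that previously omitted it now pass the flag. A minimal before/after sketch, reusing the variables from the test above:

```python
# Call site before this commit (flag omitted):
#     evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
# Call site after this commit (flag passed positionally, as in the updated test):
e1 = evaluator.evaluate(lm, task_dict, False, 0, limit,
                        bootstrap_iters=10, description_dict=None)
```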