Commit 8ac99269 authored by Jonathan Tow

Replace the `fewshot_description` API with a `description_dict` based interface

parent b67aec37
import json
import numpy as np
import random
import logging

from lm_eval import models, tasks, evaluator, base

logging.getLogger("openai").setLevel(logging.WARNING)

fewshot_descriptions = [
    "foo",
    "bar"
]
task = "lambada"
num_fewshot = 0
model = "gpt2"
model_args = ""
limit = None
no_cache = False


class CustomDescTask:
    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

        def fewshot_description():
            return self.desc
        self.task.fewshot_description = fewshot_description

    def __getattr__(self, attr):
        return getattr(self.task, attr)


def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    if not no_cache:
        lm = base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {k: CustomDescTask(v, desc) for k, v in task_dict.items()}

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot, limit)

        dumped = json.dumps(results, indent=2)
        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []
        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values
        print(writer.dumps())


if __name__ == "__main__":
    main()
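Under the `description_dict` interface introduced by this commit, a sweep like the one above no longer needs the `CustomDescTask` wrapper: descriptions are passed straight to `evaluator.evaluate`, keyed by task name. A minimal sketch, assuming the `evaluate(lm, task_dict, num_fewshot, limit, description_dict)` ordering used in the example script and updated tests further down:

```python
from lm_eval import evaluator, models, tasks

lm = models.get_model("gpt2").create_from_arg_string("")
task_dict = tasks.get_task_dict(["lambada"])

for desc in ["foo", "bar"]:
    # description_dict is keyed by task-registry name; no task wrapping needed.
    results = evaluator.evaluate(lm, task_dict, 0, None, {"lambada": desc})
    print("Description:", desc)
    print(results)
```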
@@ -9,7 +9,6 @@ for tname, Task in tasks.TASK_REGISTRY.items():#[('record', tasks.superglue.ReCo
     print('#', tname)
     docs = islice(task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct)
     print()
-    print('**Zero-Shot Prompt**:', "\n```\n" + task.fewshot_description() + "\n```\n")
     for i in range(ct):
         print()
         doc = next(docs)
...
 import argparse
 import numpy as np
+import json
 import os
 import random

 from lm_eval import tasks
@@ -12,7 +13,7 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--output_base_path', required=True)
     parser.add_argument('--tasks', default="all_tasks")
-    parser.add_argument('--provide_description', action="store_true")
+    parser.add_argument('--description_path', default=None)
     parser.add_argument('--sets', type=str, default="val")  # example: val,test
     parser.add_argument('--num_fewshot', type=int, default=1)
     parser.add_argument('--seed', type=int, default=42)
@@ -29,6 +30,12 @@ def main():
     else:
         task_names = args.tasks.split(",")
     task_dict = tasks.get_task_dict(task_names)
+
+    description_dict = {}
+    if args.description_path:
+        with open(args.description_path, 'r') as f:
+            description_dict = json.load(f)
+
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
         rnd = random.Random()
@@ -47,14 +54,16 @@ def main():
         docs = join_iters(iters)

+        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
+
         with open(os.path.join(args.output_base_path, task_name), "w") as f:
             for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs):
                 f.write(EXAMPLE_DIVIDER.format(i=i))
                 ctx = task.fewshot_context(
                     doc=doc,
-                    provide_description=args.provide_description,
                     num_fewshot=args.num_fewshot,
-                    rnd=rnd
+                    rnd=rnd,
+                    description=description
                 )
                 f.write(ctx + "\n")
...
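The new `description` keyword on `fewshot_context` can also be exercised directly; a small sketch, where the task choice, document, and description text are illustrative rather than anything this diff prescribes:

```python
import random

from lm_eval import tasks

# "copa" and the description string below are illustrative choices.
task = tasks.get_task_dict(["copa"])["copa"]
doc = next(iter(task.validation_docs()))

ctx = task.fewshot_context(
    doc=doc,
    num_fewshot=1,
    rnd=random.Random(42),
    description="Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative",
)
print(ctx)
```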
@@ -87,8 +87,7 @@ There are 2 standard approaches we follow for downloading data:
 ```
 These methods return `True`/`False` whether or not your task dataset provides documents for each split type. __Note__: if the test set doesn't have publicly available labels, please do not put it down as having a test set.

-Lastly, we need to load the documents. In our terminology, a document (`doc`) is a single natural language data example stored in a Python `dict`. E.g.:
-`{“question”: “What is the capital of France?”, “answer”: “Paris”}`. Override the following methods to load your data splits from their storage location in `DATASET_PATH`:
+Lastly, we need to load the documents. In our terminology, a document (`doc`) is a single natural language data example stored in a Python `dict`. E.g.: `{“question”: “What is the capital of France?”, “answer”: “Paris”}`. Override the following methods to load your data splits from their storage location in `DATASET_PATH`:

 ```python
 def training_docs(self):
     return #...
@@ -117,7 +116,7 @@ class TaskName(..., MultipleChoiceTask):
 This will require you to format your documents such that they contain `gold` and `choices` fields. They can also have other fields, but those will be ignored by `MultipleChoiceTask`. `choices` should be a list of possible continuations, and `gold` should be an integer specifying the index of the correct completion.

 See [this task](https://github.com/EleutherAI/lm-evaluation-harness/blob/105fa9741ff660f6a62c2eef0d2facfde36dda41/lm_eval/tasks/sat.py#L56) for an example. When used in combination with `HFTask`, it may be useful to override [`_convert_standard`](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/common.py#L28), which will be applied to every document in the HF dataset. See [this task](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/headqa.py) for an example of this.

 You can now skip ahead to <a href="#Registering-Your-Task">registering your task</a>.
@@ -125,17 +124,9 @@ You can now skip ahead to <a href="#Registering-Your-Task">registering your task
 <br>

-In the case your task is not multiple-choice, override the following methods for your task class:
+In the case your task is _not_ multiple-choice, override the following methods for your task class:

-Put the natural language task description as a single line (no `\n`s) string here. E.g. `"Translate English to French:"`
-
-```python
-def fewshot_description(self):
-    return ""
-```
-
-Format your document into a single query prompt __without the answer__ here. This method takes a single `doc` example (in dictionary form). You should concatenate its members into a nicely formatted prompt.
+Format your document into a single query prompt __without the answer__ here. This method takes a single `doc` example of type `dict` with `str` key-value members. You should concatenate these `doc` item values together into a neatly formatted prompt.

 ```python
 def doc_to_text(self, doc):
@@ -151,6 +142,41 @@ def doc_to_target(self, doc):
 Understand that the strings from `doc_to_text` and `doc_to_target` will be concatenated together to build up labeled examples in the k-shot setting where k > 0. Design with that in mind 👍.

+### Formatting Prompts
+
+If you'd like to prepend your few-shot examples with a natural language description, or provide a lone custom prompt in the zero-shot setting, you can do this on a per-task basis via the `description_dict` argument of `evaluator.evaluate`, accessible through the `evaluator` module. This `description_dict` must adhere to the following key-value structure:
+
+- **key**: the task name as specified in the lm-eval-harness task registry (see the following section on task registration).
+- **value**: the corresponding description/prompt for the task identified by **key**.
+
+E.g.
+
+```python
+description_dict = {
+    "task_name_1": "task_name_1 custom prompt or few-shot task description",
+    "task_name_2": "task_name_2 custom prompt or few-shot task description",
+    ...
+}
+```
+
+At a higher level, one can interface with `evaluator.evaluate` by simply passing a JSON file path to the `--description_path` argument of the command-line program, `main.py`. The JSON file should be structured the same way as the aforementioned `description_dict`. E.g. for a file at `/your/path/descriptions.json` you might have:
+
+```json
+{
+    "cycle_letters": "Please unscramble the letters into a word, and write that word:",
+    "copa": "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative"
+}
+```
+
+which can then be hooked up to the evaluator through the `main.py` CLI as:
+
+```bash
+python main.py \
+    --tasks cycle_letters,copa \
+    --description_path /your/path/descriptions.json \
+    ...
+```
+
 ### Registering Your Task

 Now's a good time to register your task to expose it for usage. All you'll need to do is import your task module in `lm_eval/tasks/__init__.py` and provide an entry in the `TASK_REGISTRY` dictionary with the key as the name of your benchmark task (in the form it'll be referred to in the command line) and the value as the task class. See how it's done for other tasks in the [file](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/__init__.py).
@@ -161,7 +187,7 @@ After registering your task, you can now check on your data downloading and veri
 ```bash
 python -m scripts.write_out \
-    --task <your-task> \
+    --tasks <your-task> \
     --output_base_path <path> \
     --sets <train | val | test> \
     --num_fewshot K \
...
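The documented `descriptions.json` layout can also be produced programmatically; a small sketch that writes the same `cycle_letters`/`copa` entries shown in the guide above (the output path is just an example):

```python
import json

# Keys must match names in the lm-eval-harness task registry.
descriptions = {
    "cycle_letters": "Please unscramble the letters into a word, and write that word:",
    "copa": "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative",
}

with open("descriptions.json", "w") as f:
    json.dump(descriptions, f, indent=4)
```

The resulting file can then be passed to `main.py` or `scripts.write_out` via `--description_path`.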
{
    "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation."
}
import json
import argparse

import lm_eval.tasks
import lm_eval.models
from lm_eval.evaluator import evaluate


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--description_path', default=None)
    parser.add_argument('--num_fewshot', type=int, default=0)
    parser.add_argument('--limit', type=int, default=None)
    return parser.parse_args()


def main():
    args = parse_args()

    task_names = ['hellaswag', 'copa']
    task_dict = lm_eval.tasks.get_task_dict(task_names)
    lm = lm_eval.models.get_model('dummy')()

    description_dict = {}
    if args.description_path:
        with open(args.description_path, 'r') as f:
            description_dict = json.load(f)

    num_fewshot = args.num_fewshot

    results = evaluate(
        lm,
        task_dict,
        num_fewshot,
        args.limit,
        description_dict
    )


if __name__ == '__main__':
    main()
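To inspect the returned metrics, a dump in the style of the first script's `json.dumps(results, indent=2)` could follow the `evaluate` call. A minimal sketch, assuming `results` maps task names to metric dictionaries as it does in that script:

```python
import json


def dump_results(results):
    # results: {task_name: {metric_name: value}}, as returned by evaluate() above.
    print(json.dumps(results, indent=2))
    for task_name, metrics in results.items():
        for metric, value in metrics.items():
            print('%s\t%s\t%.4f' % (task_name, metric, value))
```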
@@ -47,8 +47,8 @@ def test_evaluator(taskname, Task):
     lm.loglikelihood_rolling = ll_perp_fn

     limit = 10
-    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    e1 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
+    e2 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)

     # check that caching is working
     assert e1 == e2
@@ -81,5 +81,5 @@ def test_versions_stable(taskname, Task):
     lm.greedy_until = greedy_until

     limit = None
-    res = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    res = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
     assert_target(f"{taskname}-v{Task.VERSION}-res", res)