Commit 8ac99269 authored by Jonathan Tow

Replace the `fewshot_description` API with a `description_dict` based interface

parent b67aec37
import json
import numpy as np
import random
import logging

from lm_eval import models, tasks, evaluator, base

logging.getLogger("openai").setLevel(logging.WARNING)

fewshot_descriptions = [
    "foo",
    "bar"
]
task = "lambada"
num_fewshot = 0
model = "gpt2"
model_args = ""
limit = None
no_cache = False


class CustomDescTask:
    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

        def fewshot_description():
            return self.desc
        self.task.fewshot_description = fewshot_description

    def __getattr__(self, attr):
        return getattr(self.task, attr)


def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    if not no_cache:
        lm = base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {k: CustomDescTask(v, desc) for k, v in task_dict.items()}

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot, limit)

        dumped = json.dumps(results, indent=2)
        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []
        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values
        print(writer.dumps())


if __name__ == "__main__":
    main()
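Under the `description_dict` interface introduced by this commit, a sweep like the one above no longer needs the `CustomDescTask` wrapper: descriptions are passed straight to `evaluator.evaluate`, keyed by task name. A minimal sketch, assuming the `evaluate(lm, task_dict, num_fewshot, limit, description_dict)` ordering used in the example script and updated tests further down:

```python
from lm_eval import evaluator, models, tasks

lm = models.get_model("gpt2").create_from_arg_string("")
task_dict = tasks.get_task_dict(["lambada"])

for desc in ["foo", "bar"]:
    # description_dict is keyed by task-registry name; no task wrapping needed.
    results = evaluator.evaluate(lm, task_dict, 0, None, {"lambada": desc})
    print("Description:", desc)
    print(results)
```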
@@ -9,7 +9,6 @@ for tname, Task in tasks.TASK_REGISTRY.items():#[('record', tasks.superglue.ReCo
     print('#', tname)
     docs = islice(task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct)
     print()
-    print('**Zero-Shot Prompt**:', "\n```\n" + task.fewshot_description() + "\n```\n")
     for i in range(ct):
         print()
         doc = next(docs)
...
 import argparse
 import numpy as np
+import json
 import os
 import random

 from lm_eval import tasks
@@ -12,7 +13,7 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--output_base_path', required=True)
     parser.add_argument('--tasks', default="all_tasks")
-    parser.add_argument('--provide_description', action="store_true")
+    parser.add_argument('--description_path', default=None)
     parser.add_argument('--sets', type=str, default="val")  # example: val,test
     parser.add_argument('--num_fewshot', type=int, default=1)
     parser.add_argument('--seed', type=int, default=42)
@@ -29,6 +30,12 @@ def main():
     else:
         task_names = args.tasks.split(",")
     task_dict = tasks.get_task_dict(task_names)
+
+    description_dict = {}
+    if args.description_path:
+        with open(args.description_path, 'r') as f:
+            description_dict = json.load(f)
+
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
         rnd = random.Random()
@@ -47,14 +54,16 @@ def main():
         docs = join_iters(iters)

+        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
+
         with open(os.path.join(args.output_base_path, task_name), "w") as f:
             for i, doc in zip(range(args.num_examples), docs) if args.num_examples > 0 else enumerate(docs):
                 f.write(EXAMPLE_DIVIDER.format(i=i))
                 ctx = task.fewshot_context(
                     doc=doc,
-                    provide_description=args.provide_description,
                     num_fewshot=args.num_fewshot,
-                    rnd=rnd
+                    rnd=rnd,
+                    description=description
                 )
                 f.write(ctx + "\n")
...
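The new `description` keyword on `fewshot_context` can also be exercised directly; a small sketch, where the task choice, document, and description text are illustrative rather than anything this diff prescribes:

```python
import random

from lm_eval import tasks

# "copa" and the description string below are illustrative choices.
task = tasks.get_task_dict(["copa"])["copa"]
doc = next(iter(task.validation_docs()))

ctx = task.fewshot_context(
    doc=doc,
    num_fewshot=1,
    rnd=random.Random(42),
    description="Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative",
)
print(ctx)
```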
@@ -87,8 +87,7 @@ There are 2 standard approaches we follow for downloading data:
 ```
 These methods return `True`/`False` whether or not your task dataset provides documents for each split type. __Note__: if the test set doesn't have publicly available labels, please do not put it down as having a test set.

-Lastly, we need to load the documents. In our terminology, a document (`doc`) is a single natural language data example stored in a Python `dict`. E.g.:
-`{“question”: “What is the capital of France?”, “answer”: “Paris”}`. Override the following methods to load your data splits from their storage location in `DATASET_PATH`:
+Lastly, we need to load the documents. In our terminology, a document (`doc`) is a single natural language data example stored in a Python `dict`. E.g.: `{“question”: “What is the capital of France?”, “answer”: “Paris”}`. Override the following methods to load your data splits from their storage location in `DATASET_PATH`:

 ```python
 def training_docs(self):
     return #...
@@ -117,7 +116,7 @@ class TaskName(..., MultipleChoiceTask):
 This will require you to format your documents such that they contain `gold` and `choices` fields. They can also have other fields, but those will be ignored by `MultipleChoiceTask`. `choices` should be a list of possible continuations, and `gold` should be an integer specifying the index of the correct completion.

 See [this task](https://github.com/EleutherAI/lm-evaluation-harness/blob/105fa9741ff660f6a62c2eef0d2facfde36dda41/lm_eval/tasks/sat.py#L56) for an example. When used in combination with `HFTask`, it may be useful to override [`_convert_standard`](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/common.py#L28), which will be applied to every document in the HF dataset. See [this task](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/headqa.py) for an example of this.

 You can now skip ahead to <a href="#Registering-Your-Task">registering your task</a>.
@@ -125,17 +124,9 @@ You can now skip ahead to <a href="#Registering-Your-Task">registering your task
 <br>

-In the case your task is not multiple-choice, override the following methods for your task class:
+In the case your task is _not_ multiple-choice, override the following methods for your task class:

-Put the natural language task description as a single line (no `\n`s) string here. E.g. `"Translate English to French:"`
-
-```python
-def fewshot_description(self):
-    return ""
-```
-
-Format your document into a single query prompt __without the answer__ here. This method takes a single `doc` example (in dictionary form). You should concatenate its members into a nicely formatted prompt.
+Format your document into a single query prompt __without the answer__ here. This method takes a single `doc` example of type `dict` with `str` key-value members. You should concatenate these `doc` item values together into a neatly formatted prompt.

 ```python
 def doc_to_text(self, doc):
@@ -151,6 +142,41 @@ def doc_to_target(self, doc):
 Understand that the strings from `doc_to_text` and `doc_to_target` will be concatenated together to build up labeled examples in the k-shot setting where k > 0. Design with that in mind 👍.

+### Formatting Prompts
+
+If you'd like to prepend your few-shot examples with a natural language description, or provide a lone custom prompt in the zero-shot setting, you can do this on a per-task basis via the `description_dict` argument of `evaluator.evaluate`, accessible through the `evaluator` module. This `description_dict` must adhere to the following key-value structure:
+
+- **key**: the task name as specified in the lm-eval-harness task registry (see the following section on task registration).
+- **value**: the corresponding description/prompt for the task identified by **key**.
+
+E.g.
+
+```python
+description_dict = {
+    "task_name_1": "task_name_1 custom prompt or few-shot task description",
+    "task_name_2": "task_name_2 custom prompt or few-shot task description",
+    ...
+}
+```
+
+At a higher level, one can interface with `evaluator.evaluate` by simply passing a JSON file path to the `--description_path` argument of the command-line program, `main.py`. The JSON file should be structured the same way as the aforementioned `description_dict`. E.g. for a file at `/your/path/descriptions.json` you might have:
+
+```json
+{
+    "cycle_letters": "Please unscramble the letters into a word, and write that word:",
+    "copa": "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative"
+}
+```
+
+which can then be hooked up to the evaluator through the `main.py` CLI as:
+
+```bash
+python main.py \
+    --tasks cycle_letters,copa \
+    --description_path /your/path/descriptions.json \
+    ...
+```
+
 ### Registering Your Task

 Now's a good time to register your task to expose it for usage. All you'll need to do is import your task module in `lm_eval/tasks/__init__.py` and provide an entry in the `TASK_REGISTRY` dictionary with the key as the name of your benchmark task (in the form it'll be referred to in the command line) and the value as the task class. See how it's done for other tasks in the [file](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/__init__.py).
@@ -161,7 +187,7 @@ After registering your task, you can now check on your data downloading and veri
 ```bash
 python -m scripts.write_out \
-    --task <your-task> \
+    --tasks <your-task> \
     --output_base_path <path> \
     --sets <train | val | test> \
     --num_fewshot K \
...
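The documented `descriptions.json` layout can also be produced programmatically; a small sketch that writes the same `cycle_letters`/`copa` entries shown in the guide above (the output path is just an example):

```python
import json

# Keys must match names in the lm-eval-harness task registry.
descriptions = {
    "cycle_letters": "Please unscramble the letters into a word, and write that word:",
    "copa": "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative",
}

with open("descriptions.json", "w") as f:
    json.dump(descriptions, f, indent=4)
```

The resulting file can then be passed to `main.py` or `scripts.write_out` via `--description_path`.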
{
    "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation."
}
import json
import argparse

import lm_eval.tasks
import lm_eval.models
from lm_eval.evaluator import evaluate


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--description_path', default=None)
    parser.add_argument('--num_fewshot', type=int, default=0)
    parser.add_argument('--limit', type=int, default=None)
    return parser.parse_args()


def main():
    args = parse_args()

    task_names = ['hellaswag', 'copa']
    task_dict = lm_eval.tasks.get_task_dict(task_names)
    lm = lm_eval.models.get_model('dummy')()

    description_dict = {}
    if args.description_path:
        with open(args.description_path, 'r') as f:
            description_dict = json.load(f)

    num_fewshot = args.num_fewshot

    results = evaluate(
        lm,
        task_dict,
        num_fewshot,
        args.limit,
        description_dict
    )


if __name__ == '__main__':
    main()
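To inspect the returned metrics, a dump in the style of the first script's `json.dumps(results, indent=2)` could follow the `evaluate` call. A minimal sketch, assuming `results` maps task names to metric dictionaries as it does in that script:

```python
import json


def dump_results(results):
    # results: {task_name: {metric_name: value}}, as returned by evaluate() above.
    print(json.dumps(results, indent=2))
    for task_name, metrics in results.items():
        for metric, value in metrics.items():
            print('%s\t%s\t%.4f' % (task_name, metric, value))
```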
@@ -47,8 +47,8 @@ def test_evaluator(taskname, Task):
     lm.loglikelihood_rolling = ll_perp_fn

     limit = 10
-    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
-    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    e1 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
+    e2 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)

     # check that caching is working
     assert e1 == e2
@@ -81,5 +81,5 @@ def test_versions_stable(taskname, Task):
     lm.greedy_until = greedy_until

     limit = None
-    res = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10)
+    res = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
     assert_target(f"{taskname}-v{Task.VERSION}-res", res)