"vscode:/vscode.git/clone" did not exist on "2ba6da8abc68ea073377407be8fa93826409b9d6"
Commit ee53be21 authored by Jonathan Tow

Add `provide_description` arg for backward compat

parent 7a357971
@@ -11,7 +11,8 @@ import numpy as np
 def simple_evaluate(model, model_args, task_names,
                     num_fewshot=0, batch_size=None, device=None,
-                    no_cache=False, limit=None, bootstrap_iters=100000):
+                    no_cache=False, limit=None, bootstrap_iters=100000,
+                    description_dict_path=None):
     """Instantiate and evaluate a model on a list of tasks.
 
     :param model: str
@@ -32,6 +33,8 @@ def simple_evaluate(model, model_args, task_names,
         Limit the number of examples per task (only use this for testing)
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
+    :param description_dict_path:
+        Path to a JSON file containing `task_name: description` key-values for custom prompts
     :return
         Dictionary of results
     """
@@ -71,7 +74,7 @@ def simple_evaluate(model, model_args, task_names,
     return results
 
 
-def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
+def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000, description_dict=None):
     """Instantiate and evaluate a model on a list of tasks.
 
     :param lm: obj
@@ -86,6 +89,8 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
         Limit the number of examples per task (only use this for testing)
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
+    :param description_dict:
+        Dictionary of task descriptions of the form: `task_name: description`
     :return
         Dictionary of results
     """
...
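Taken together, the two entry points now expose custom prompts in two forms: `simple_evaluate` accepts a path to a JSON file via `description_dict_path`, while `evaluate` accepts the already-loaded mapping via `description_dict` alongside the restored positional `provide_description` flag. A minimal sketch of a direct call under the new `evaluate` signature, reusing the `dummy` model and `copa` task that appear elsewhere in this commit; the JSON path is a placeholder:

```python
import json

import lm_eval.models
import lm_eval.tasks
from lm_eval import evaluator

# Load the per-task prompt mapping that `simple_evaluate` would otherwise
# read for you from `description_dict_path`.
with open("/your/path/descriptions.json") as f:  # placeholder path
    description_dict = json.load(f)

lm = lm_eval.models.get_model("dummy")()          # dummy LM, as in the tests below
task_dict = {"copa": lm_eval.tasks.get_task("copa")()}

results = evaluator.evaluate(
    lm,
    task_dict,
    False,   # provide_description, kept for backward compatibility
    0,       # num_fewshot
    None,    # limit
    description_dict=description_dict,
)
```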
@@ -237,9 +237,6 @@ class EthicsUtilitarianismOriginal(Ethics):
         for doc in docs:
             yield {"activity": doc[0], "baseline": doc[1], "rating": ""}
 
-    def fewshot_description(self):
-        return "Rate how pleasant each of the following activities is on a scale from 1 (very unpleasant) to 10 (very pleasant).\n\n"
-
     def fewshot_examples(self, k, rnd):
         # Overwriting fewshot examples as k can be max 5
         assert k <= 5, "There are only 5 possible shots for this task. Refer to the V2 for more."
@@ -350,9 +347,6 @@ class EthicsVirtue(Ethics):
     def get_prefix(self):
         return "virtue/virtue"
 
-    def fewshot_description(self):
-        return "The following is a list of sentences and traits, along with whether the trait is exhibited in that sentence.\n\n"
-
     def process_doc(self, doc):
         # Append identifiers before shuffling to calculate exact matches lateron & skip the first element of headers
         return [x + [i] for i, x in enumerate(doc[1:])]
...
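The hard-coded `fewshot_description` prompts deleted here (and in the multiple-choice task in the next hunk) are the kind of text that is now meant to be supplied externally through `description_dict`. A sketch of the equivalent mapping; the registry keys are assumed, since the task names are not shown anywhere in this commit:

```python
# Hypothetical replacement for the removed hard-coded prompts; the keys
# ("ethics_utilitarianism_original", "ethics_virtue") are assumed registry
# names, not taken from this diff.
description_dict = {
    "ethics_utilitarianism_original": (
        "Rate how pleasant each of the following activities is on a scale "
        "from 1 (very unpleasant) to 10 (very pleasant)."
    ),
    "ethics_virtue": (
        "The following is a list of sentences and traits, along with whether "
        "the trait is exhibited in that sentence."
    ),
}
```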
@@ -114,9 +114,5 @@ class GeneralHendrycksTest(MultipleChoiceTask):
         return rnd.sample(list(self._fewshot_docs), k)
 
-    def fewshot_description(self):
-        subject = self.subject.replace("_", " ")
-        return f"The following are multiple choice questions (with answers) about {subject}."
-
     def doc_to_text(self, doc):
         return doc["query"]
@@ -36,9 +36,9 @@ class PROST(HFTask, MultipleChoiceTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd, description=None):
         assert num_fewshot == 0, 'PROST is designed to probe models in a zero-shot fashion only.'
-        return super().fewshot_context(doc, num_fewshot, rnd, description)
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd, description)
 
     def _convert_standard(self, doc):
         out_doc = {
...
@@ -85,9 +85,9 @@ class TruthfulQAMultipleChoice(Task):
     def doc_to_target(self, doc):
         return " "
 
-    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd, description=None):
         assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
-        return super().fewshot_context(doc, num_fewshot, rnd, description)
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd, description)
 
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
@@ -217,9 +217,9 @@ class TruthfulQAGeneration(Task):
     def doc_to_target(self, doc):
         return " "
 
-    def fewshot_context(self, doc, num_fewshot, rnd, description=None):
+    def fewshot_context(self, doc, num_fewshot, provide_description, rnd, description=None):
         assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
-        return super().fewshot_context(doc, num_fewshot, rnd, description)
+        return super().fewshot_context(doc, num_fewshot, provide_description, rnd, description)
 
     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
...
@@ -51,7 +51,7 @@ def main():
     values = []
     for taskname in task_list.split(","):
         lm.tokencost = 0
-        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, 0, None, bootstrap_iters=10)
+        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, False, 0, None, bootstrap_iters=10)
         print(taskname, lm.tokencost)
         values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.0008, lm.tokencost / 1000 * 0.0012, lm.tokencost / 1000 * 0.006, lm.tokencost / 1000 * 0.06])
...
@@ -144,7 +144,7 @@ Understand that the strings from `doc_to_text` and `doc_to_target` will be conca
 ### Formatting Prompts
 
-If you'd like to prepend your few-shot examples with a natural language description or provide a lone custom prompt under a zero-shot setting, you can do this on a per-task basis via the `description_dict` arg of `evaluator.evaluate` which is accessible through the `evaluator` module. This `description_dict` must adhere to the following key-value structure:
+If you'd like to prepend your few-shot examples with a natural language description or provide a lone custom prompt for a zero-shot task, you can do so on a per-task basis via the `description_dict` arg of `evaluator.evaluate` which is accessible from the `evaluator` module. This `description_dict` must adhere to the following key-value structure:
 
 - **key**: the task name as specified in the lm-eval-harness task registry (see the following section on task registry).
 - **value**: the corresponding description/prompt for the task identified by **key**.
@@ -153,13 +153,13 @@ E.g.
 ```python
 description_dict = {
-    "task_name_1": "task_name_1 custom prompt or few-shot task description",
-    "task_name_2": "task_name_2 custom prompt or few-shot task description",
+    "task_name_1": "fewshot description",
+    "task_name_2": "fewshot description",
     ...
 }
 ```
 
-At a higher level, one can interface with `evaluator.evaluate` by simply passing a JSON file path to the `description_path` arg of the command-line interface program, `main.py`. The JSON file pointed to should be structured the same way as the aforementioned `description_dict`. E.g. for some file at `/your/path/descriptions.json` you might have:
+One can also interface with `evaluator.evaluate` from a higher level by simply passing a JSON file path to the `description_dict_path` arg of the command-line interface program, `main.py`. The JSON file pointed to should be structured the same way as the aforementioned `description_dict`. E.g. for some file at `/your/path/descriptions.json` you might have:
 
 ```json
 {
@@ -173,7 +173,7 @@ which can then be hooked up to the evaluator through the `main.py` CLI as:
 ```python
 python main.py \
     --tasks cycle_letters,copa \
-    --description_path /your/path/descriptions.json \
+    --description_dict_path /your/path/descriptions.json \
     ...
 ```
@@ -187,11 +187,12 @@ After registering your task, you can now check on your data downloading and veri
 ```bash
 python -m scripts.write_out \
-    --tasks <your-task> \
     --output_base_path <path> \
+    --tasks <your-task> \
     --sets <train | val | test> \
     --num_fewshot K \
-    --num_examples N
+    --num_examples N \
+    --description_dict_path <path>
 ```
 
 Open the file specified at the `--output_base_path <path>` and ensure it passes
...
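As a concrete illustration of the file-based route described in the guide above, a short hypothetical script that writes such a `descriptions.json`; the prompt strings and output path are placeholders, and the task names simply reuse the `--tasks cycle_letters,copa` example:

```python
import json

# Hypothetical per-task prompts, keyed by registry task name as the guide requires.
descriptions = {
    "cycle_letters": "Please unscramble the letters into a word, and write that word:",
    "copa": "Given a premise, pick the more plausible of the two alternatives:",
}

with open("/your/path/descriptions.json", "w") as f:  # placeholder path
    json.dump(descriptions, f, indent=4)
```

The resulting file can then be handed to `main.py` via `--description_dict_path`, or to `simple_evaluate` via its new `description_dict_path` argument.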
@@ -5,15 +5,13 @@ import lm_eval.models
 from lm_eval.evaluator import evaluate
 
 
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--description_path', default=None)
-    parser.add_argument('--num_fewshot', type=int, default=0)
-    parser.add_argument('--limit', type=int, default=None)
-    return parser.parse_args()
-
-
-def main():
+def test_cli_description_dict_path():
+    def parse_args():
+        parser = argparse.ArgumentParser()
+        parser.add_argument('--description_dict_path', default=None)
+        parser.add_argument('--num_fewshot', type=int, default=0)
+        parser.add_argument('--limit', type=int, default=None)
+        return parser.parse_args()
+
     args = parse_args()
     task_names = ['hellaswag', 'copa']
@@ -21,14 +19,15 @@ def main():
     lm = lm_eval.models.get_model('dummy')()
 
     description_dict = {}
-    if args.description_path:
-        with open(args.description_path, 'r') as f:
+    if args.description_dict_path:
+        with open(args.description_dict_path, 'r') as f:
             description_dict = json.load(f)
 
     num_fewshot = args.num_fewshot
 
     results = evaluate(
         lm,
         task_dict,
+        False,
         num_fewshot,
         args.limit,
         description_dict
@@ -36,4 +35,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    test_cli_description_dict_path()
@@ -48,8 +48,8 @@ def test_evaluator(taskname, task_class):
     lm.loglikelihood_rolling = ll_perp_fn
 
     limit = 10
-    e1 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
-    e2 = evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
+    e1 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10, description_dict=None)
+    e2 = evaluator.evaluate(lm, task_dict, False, 0, limit, bootstrap_iters=10, description_dict=None)
 
     # check that caching is working
     assert e1 == e2
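The change at these call sites summarizes the backward-compat intent of the commit: `provide_description` is threaded back through explicitly, so callers that previously omitted it now pass the flag. A minimal before/after sketch, reusing the variables from the test above:

```python
# Call site before this commit (flag omitted):
#     evaluator.evaluate(lm, task_dict, 0, limit, description_dict=None, bootstrap_iters=10)
# Call site after this commit (flag passed positionally, as in the updated test):
e1 = evaluator.evaluate(lm, task_dict, False, 0, limit,
                        bootstrap_iters=10, description_dict=None)
```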