Commit 10dd7d38 authored by Jonathan Tow

Make `evaluate` and `simple_evaluate` description args consistent

parent d1319950
@@ -12,7 +12,7 @@ import numpy as np
 def simple_evaluate(model, model_args, task_names,
                     num_fewshot=0, batch_size=None, device=None,
                     no_cache=False, limit=None, bootstrap_iters=100000,
-                    description_dict_path=None):
+                    description_dict=None):
     """Instantiate and evaluate a model on a list of tasks.

     :param model: str
@@ -33,8 +33,8 @@ def simple_evaluate(model, model_args, task_names,
         Limit the number of examples per task (only use this for testing)
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
-    :param description_dict_path:
-        Path to a JSON file containing `task_name: description` key-values for custom prompts
+    :param description_dict:
+        Dictionary of custom task descriptions of the form: `task_name: description`
     :return
         Dictionary of results
     """
@@ -52,11 +52,6 @@ def simple_evaluate(model, model_args, task_names,
     task_dict = lm_eval.tasks.get_task_dict(task_names)

-    description_dict = {}
-    if description_dict_path:
-        with open(description_dict_path, 'r') as f:
-            description_dict = json.load(f)
-
     results = evaluate(lm, task_dict, False, num_fewshot, limit, description_dict=description_dict)

     # add info about the model and few shot config
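
With this change, callers of `evaluator.simple_evaluate` load any description JSON themselves and pass the resulting dict. A minimal sketch of the new calling convention, assuming the evaluator module is importable as `lm_eval.evaluator`; the model and task names below are placeholders, and the `model_args` string depends on your setup:

```python
import json

from lm_eval import evaluator

# The caller now reads the descriptions file itself; simple_evaluate only
# accepts the already-loaded dict, not a file path.
with open("/your/path/descriptions.json", "r") as f:
    description_dict = json.load(f)

results = evaluator.simple_evaluate(
    model="gpt2",            # placeholder model name
    model_args="",           # placeholder; depends on your backend/config
    task_names=["lambada"],  # placeholder task name
    num_fewshot=0,
    description_dict=description_dict,
)
```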
@@ -90,7 +85,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param description_dict:
-        Dictionary of task descriptions of the form: `task_name: description`
+        Dictionary of custom task descriptions of the form: `task_name: description`
     :return
         Dictionary of results
     """
...
@@ -35,6 +35,11 @@ def main():
     else:
         task_names = args.tasks.split(",")

+    description_dict = {}
+    if args.description_dict_path:
+        with open(args.description_dict_path, 'r') as f:
+            description_dict = json.load(f)
+
     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,
@@ -44,6 +49,7 @@
         device=args.device,
         no_cache=args.no_cache,
         limit=args.limit,
+        description_dict=description_dict
     )
     dumped = json.dumps(results, indent=2)
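
On the CLI side, `main.py` now performs the JSON load itself and forwards the dict to `simple_evaluate`. Assuming the argparse flag backing `args.description_dict_path` is spelled `--description_dict_path` (and using placeholder model/task values), an invocation might look like:

```bash
# Hypothetical flags; --description_dict_path mirrors args.description_dict_path above.
python main.py \
    --model gpt2 \
    --tasks lambada \
    --description_dict_path /your/path/descriptions.json
```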
...
@@ -159,7 +159,7 @@ description_dict = {
 }
 ```

-One can also interface with `evaluator.evaluate` from a higher level by simply passing a JSON file path to the `description_dict_path` arg of the command-line interface program, `main.py`. The JSON file pointed to should be structured the same way as the aforementioned `description_dict`. E.g. for some file at `/your/path/descriptions.json` you might have:
+One can also interface with `evaluator.evaluate`/`evaluator.simple_evaluate` from a higher level by simply passing a JSON file path to the `description_dict_path` arg of the command-line interface (CLI) programs, `main.py` and `write_out.py`. The JSON file pointed to should be structured the same way as the aforementioned `description_dict`. E.g. for some file at `/your/path/descriptions.json` you might have:
 ```json
 {
@@ -168,7 +168,7 @@ One can also interface with `evaluator.evaluate` from a higher level by simply p
 }
 ```

-which can then be hooked up to the evaluator through the `main.py` CLI as:
+which can then be used, for example, in the `main.py` CLI as:
 ```python
 python main.py \
...