Commit 52f75f0e authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into versioning

parents 331d7c51 b072bb0d
......@@ -81,7 +81,7 @@ class TaskConfig(dict):
fewshot_delimiter: str = "\n\n"
fewshot_config: dict = None
# runtime configuration options
num_fewshot: int = 0
num_fewshot: int = -1
# scoring options
metric_list: list = None
output_type: str = "generate_until"
......
......@@ -134,13 +134,17 @@ def simple_evaluate(
config["generation_kwargs"].update(gen_kwargs)
if num_fewshot is not None:
if config["num_fewshot"] > 0:
if config["num_fewshot"] == 0:
eval_logger.info(
f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored."
)
else:
default_num_fewshot = config["num_fewshot"]
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj._config["num_fewshot"] = num_fewshot
task_obj._config["num_fewshot"] = num_fewshot
if check_integrity:
run_task_tests(task_list=tasks)
......@@ -233,6 +237,8 @@ def evaluate(
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
task_group_alias = collections.defaultdict(dict)
# store num-fewshot value per task
num_fewshot = collections.defaultdict(int)
# get lists of each type of request
for task_name, task in task_dict.items():
......@@ -251,6 +257,12 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
if "num_fewshot" in configs[task_name]:
n_shot = configs[task_name]["num_fewshot"]
else:
n_shot = -1
num_fewshot[task_name] = n_shot
if "task_alias" in configs[task_name]:
task_group_alias[task_name] = configs[task_name]["task_alias"]
......@@ -612,11 +624,15 @@ def evaluate(
else:
groups_agg[group]["alias"] = tab_string + group
for group_name, task_list in task_hierarchy.items():
num_fewshot[group_name] = num_fewshot[task_list[0]]
results_dict = {
"results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
}
if log_samples:
results_dict["samples"] = dict(samples)
......
......@@ -5,6 +5,7 @@ validation_split: train
doc_to_text: ""
doc_to_target: 0
doc_to_choice: "{{[sentence_good, sentence_bad]}}"
num_fewshot: 0
should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list:
......
# Generated by utils.py
dataset_name: adjunct_island
include: template_yaml
include: _template_yaml
task: blimp_adjunct_island
# Generated by utils.py
dataset_name: anaphor_gender_agreement
include: template_yaml
include: _template_yaml
task: blimp_anaphor_gender_agreement
# Generated by utils.py
dataset_name: anaphor_number_agreement
include: template_yaml
include: _template_yaml
task: blimp_anaphor_number_agreement
# Generated by utils.py
dataset_name: animate_subject_passive
include: template_yaml
include: _template_yaml
task: blimp_animate_subject_passive
# Generated by utils.py
dataset_name: animate_subject_trans
include: template_yaml
include: _template_yaml
task: blimp_animate_subject_trans
# Generated by utils.py
dataset_name: causative
include: template_yaml
include: _template_yaml
task: blimp_causative
# Generated by utils.py
dataset_name: complex_NP_island
include: template_yaml
include: _template_yaml
task: blimp_complex_NP_island
# Generated by utils.py
dataset_name: coordinate_structure_constraint_complex_left_branch
include: template_yaml
include: _template_yaml
task: blimp_coordinate_structure_constraint_complex_left_branch
# Generated by utils.py
dataset_name: coordinate_structure_constraint_object_extraction
include: template_yaml
include: _template_yaml
task: blimp_coordinate_structure_constraint_object_extraction
# Generated by utils.py
dataset_name: determiner_noun_agreement_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_1
# Generated by utils.py
dataset_name: determiner_noun_agreement_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_irregular_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_irregular_1
# Generated by utils.py
dataset_name: determiner_noun_agreement_irregular_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_irregular_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_irregular_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_irregular_1
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adj_irregular_2
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adj_irregular_2
# Generated by utils.py
dataset_name: determiner_noun_agreement_with_adjective_1
include: template_yaml
include: _template_yaml
task: blimp_determiner_noun_agreement_with_adjective_1
Markdown is supported
Attach a file by drag & drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.