Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into versioning

52f75f0e · lintangsutawika · 331d7c51 · b072bb0d · 52f75f0e · 52f75f0e
Commit 52f75f0e authored Nov 28, 2023 by lintangsutawika
12 changed files
--- a/lm_eval/tasks/blimp/tough_vs_raising_1.yaml
+++ b/lm_eval/tasks/blimp/tough_vs_raising_1.yaml
 # Generated by utils.py
 dataset_name: tough_vs_raising_1
-include: template_yaml
+include: _template_yaml
 task: blimp_tough_vs_raising_1
--- a/lm_eval/tasks/blimp/tough_vs_raising_2.yaml
+++ b/lm_eval/tasks/blimp/tough_vs_raising_2.yaml
 # Generated by utils.py
 dataset_name: tough_vs_raising_2
-include: template_yaml
+include: _template_yaml
 task: blimp_tough_vs_raising_2
--- a/lm_eval/tasks/blimp/transitive.yaml
+++ b/lm_eval/tasks/blimp/transitive.yaml
 # Generated by utils.py
 dataset_name: transitive
-include: template_yaml
+include: _template_yaml
 task: blimp_transitive
--- a/lm_eval/tasks/blimp/wh_island.yaml
+++ b/lm_eval/tasks/blimp/wh_island.yaml
 # Generated by utils.py
 dataset_name: wh_island
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_island
--- a/lm_eval/tasks/blimp/wh_questions_object_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_object_gap.yaml
 # Generated by utils.py
 dataset_name: wh_questions_object_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_questions_object_gap
--- a/lm_eval/tasks/blimp/wh_questions_subject_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_subject_gap.yaml
 # Generated by utils.py
 dataset_name: wh_questions_subject_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_questions_subject_gap
--- a/lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml
 # Generated by utils.py
 dataset_name: wh_questions_subject_gap_long_distance
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_questions_subject_gap_long_distance
--- a/lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_no_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_no_gap
--- a/lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_no_gap_long_distance
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_no_gap_long_distance
--- a/lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_with_gap
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_with_gap
--- a/lm_eval/tasks/blimp/wh_vs_that_with_gap_long_distance.yaml
+++ b/lm_eval/tasks/blimp/wh_vs_that_with_gap_long_distance.yaml
 # Generated by utils.py
 dataset_name: wh_vs_that_with_gap_long_distance
-include: template_yaml
+include: _template_yaml
 task: blimp_wh_vs_that_with_gap_long_distance
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -339,31 +339,27 @@ def make_table(result_dict, column: str = "results"):
    elif column == "groups":
        column_name = "Groups"

-    md_writer = MarkdownTableWriter()
-    latex_writer = LatexTableWriter()
-    md_writer.headers = [
-        column_name,
-        "Version",
-        "Filter",
-        "Metric",
-        "Value",
-        "",
-        "Stderr",
-    ]
-    latex_writer.headers = [
+    all_headers = [
        column_name,
        "Version",
        "Filter",
+        "n-shot",
        "Metric",
        "Value",
        "",
        "Stderr",
    ]

+    md_writer = MarkdownTableWriter()
+    latex_writer = LatexTableWriter()
+    md_writer.headers = all_headers
+    latex_writer.headers = all_headers
+
    values = []

    for k, dic in result_dict[column].items():
        version = result_dict["versions"][k]
+        n = str(result_dict["n-shot"][k])

        if "alias" in dic:
            k = dic.pop("alias")
@@ -375,9 +371,9 @@ def make_table(result_dict, column: str = "results"):

            if m + "_stderr" + "," + f in dic:
                se = dic[m + "_stderr" + "," + f]
-                values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se])
+                values.append([k, version, f, n, m, "%.4f" % v, "±", "%.4f" % se])
            else:
-                values.append([k, version, f, m, "%.4f" % v, "", ""])
+                values.append([k, version, f, n, m, "%.4f" % v, "", ""])
            k = ""
            version = ""
    md_writer.value_matrix = values