Commit e4db76cb authored by haileyschoelkopf

Merge branch 'main' into multimodal-prototyping

parents 6cc6e9cd ad80f555
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
- xwinograd
dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice
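Since this stanza is merged into each generated per-language config, the dataset_name override is the only language-specific piece. A rough sketch of how such a merge composes (the make_language_config helper and the xwinograd_en task name are assumptions for illustration, not harness API):

# Illustrative only: mimics how a language-specific config could override
# the shared fields above. Not the harness's actual generation code.
base_config = {
    "group": ["xwinograd"],
    "dataset_path": "Muennighoff/xwinograd",
    "dataset_name": None,  # overridden by the language-specific config
    "output_type": "multiple_choice",
}

def make_language_config(lang: str) -> dict:
    """Fill in the language-specific fields on top of the shared base."""
    config = dict(base_config)
    config["task"] = f"xwinograd_{lang}"  # hypothetical task name
    config["dataset_name"] = lang         # e.g. "en", "fr", "jp"
    return config

print(make_language_config("en"))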
@@ -308,7 +308,7 @@ class Reorderer:
         return res


-def make_table(result_dict, column: str = "results", sort_results: bool = True):
+def make_table(result_dict, column: str = "results", sort_results: bool = False):
     """Generate table of results."""
     from pytablewriter import LatexTableWriter, MarkdownTableWriter
@@ -338,20 +338,21 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
     keys = result_dict[column].keys()
     if sort_results:
-        # sort entries alphabetically
+        # sort entries alphabetically by task or group name.
+        # NOTE: we default here to false, because order matters for multi-level table printing a la mmlu.
+        # sorting here would mess that up
         keys = sorted(keys)

     for k in keys:
         dic = result_dict[column][k]
-        version = result_dict["versions"].get(k, "N/A")
-        n = str(result_dict["n-shot"][k])
+        version = result_dict["versions"].get(k, " N/A")
+        n = str(result_dict.get("n-shot", " ").get(k, " "))
         higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})

         if "alias" in dic:
             k = dic.pop("alias")

         metric_items = dic.items()
-        if sort_results:
-            metric_items = sorted(metric_items)
+        metric_items = sorted(metric_items)

         for (mf), v in metric_items:
             m, _, f = mf.partition(",")
@@ -362,8 +363,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
         if m + "_stderr" + "," + f in dic:
             se = dic[m + "_stderr" + "," + f]
-            if se != "N/A":
-                se = "%.4f" % se
+            se = " N/A" if se == "N/A" else "%.4f" % se
             values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
         else:
             values.append([k, version, f, n, m, hib, "%.4f" % v, "", ""])
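To see what the new default changes in practice, here is a minimal, hedged usage sketch (it assumes make_table is importable from lm_eval.utils and that pytablewriter is installed; the toy result_dict fills only the keys the function reads):

from lm_eval.utils import make_table  # assumed import path

# Toy result_dict with only the keys make_table reads above.
result_dict = {
    "results": {
        "mmlu_stem": {"acc,none": 0.2474, "acc_stderr,none": "N/A", "alias": "stem"},
    },
    "versions": {"mmlu_stem": 1},
    "n-shot": {"mmlu_stem": 0},
    "higher_is_better": {"mmlu_stem": {"acc": True}},
}

# sort_results now defaults to False, so the evaluator's task ordering
# (group rows followed by their subtasks, a la mmlu) is kept intact.
print(make_table(result_dict))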
@@ -76,7 +76,6 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
-unitxt = ["unitxt"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",
@@ -95,7 +94,6 @@ all = [
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
-    "lm_eval[unitxt]"
 ]

 [tool.ruff.lint]
@@ -17,12 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
 BibTeX-formatted citation goes here
 ```

-### Groups and Tasks
+### Groups, Tags, and Tasks

 #### Groups

 * `group_name`: `Short description`

+#### Tags
+
+* `tag_name`: `Short description`
+
 #### Tasks

 * `task_name`: `1-sentence description of what this particular task does`
@@ -90,7 +90,7 @@ def test_evaluator(
             "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
         ),
         (
-            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
+            ["mmlu_stem"],
             10,
             "hf",
             "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
-| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
-|----------------|-------|------|-----:|--------|---|----:|---|------|
-|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
-| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
-| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
-| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
-| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
-| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
+| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
+|-------------|------:|------|-----:|--------|---|----:|---|------|
+|arc_challenge| 1|none | 0|acc |↑ | 0.0|± | N/A|
+| | |none | 0|acc_norm|↑ | 0.0|± | N/A|
+|arc_easy | 1|none | 0|acc |↑ | 0.3|± | N/A|
+| | |none | 0|acc_norm|↑ | 0.1|± | N/A|
\ No newline at end of file
 | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
 |--------------|------:|------|-----:|----------|---|-------:|---|------|
-|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
-| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
+|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± | N/A|
+| | |none | 0|perplexity|↓ |605.3866|± | N/A|
\ No newline at end of file
-| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
-|----------------|------:|------|-----:|------|---|----:|---|------|
-|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
-|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
-|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
+| Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr|
+|-------------------------------|------:|------|-----:|------|---|-----:|---|------|
+|stem | 1|none | |acc |↑ |0.2474|± | N/A|
+| - abstract_algebra | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - anatomy | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - astronomy | 0|none | 0|acc |↑ |0.1000|± | N/A|
+| - college_biology | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - college_chemistry | 0|none | 0|acc |↑ |0.1000|± | N/A|
+| - college_computer_science | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - college_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - college_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - computer_security | 0|none | 0|acc |↑ |0.5000|± | N/A|
+| - conceptual_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - electrical_engineering | 0|none | 0|acc |↑ |0.4000|± | N/A|
+| - elementary_mathematics | 0|none | 0|acc |↑ |0.0000|± | N/A|
+| - high_school_biology | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - high_school_chemistry | 0|none | 0|acc |↑ |0.4000|± | N/A|
+| - high_school_computer_science| 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - high_school_mathematics | 0|none | 0|acc |↑ |0.2000|± | N/A|
+| - high_school_physics | 0|none | 0|acc |↑ |0.3000|± | N/A|
+| - high_school_statistics | 0|none | 0|acc |↑ |0.0000|± | N/A|
+| - machine_learning | 0|none | 0|acc |↑ |0.3000|± | N/A|
\ No newline at end of file
 | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
 |--------|------:|------|-----:|---------------|---|-------:|---|------|
-|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
-| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
-| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file
+|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± | N/A|
+| | |none | 0|byte_perplexity|↓ | 2.5304|± | N/A|
+| | |none | 0|word_perplexity|↓ |130.4801|± | N/A|
\ No newline at end of file
group: test-1
group_alias: test 1
task:
  - piqa # string task
  - ai2_arc # string tag
  - task: super-glue-lm-eval-v1 # Should this be spread out?
    num_fewshot: 3
  - task: swag # dict registered task
    num_fewshot: 2
  - task: mmlu
    num_fewshot: 5
  - group: nli-tasks # dict group
    task:
      - anli
      - boolq
      - sglue_rte
    num_fewshot: 4
    metric_list:
      - metric: brier_score
  - task: sciq # dict registered task duplicate
    task_alias: sciq 2-shot
    num_fewshot: 2
  - task: sciq # dict registered task duplicate
    task_alias: sciq 4-shot
    num_fewshot: 4
  - task: sciq # dict registered task duplicate
    task_alias: sciq 6-shot
    num_fewshot: 6
  - task: siqa_custom # dict task
    dataset_path: social_i_qa
    dataset_name: null
    output_type: multiple_choice
    training_split: train
    validation_split: validation
    doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
    target_delimiter: " "
    doc_to_choice:
      - "{{answerA}}"
      - "{{answerB}}"
      - "{{answerC}}"
    doc_to_target: "{{ (label|int) - 1 }}"
    metric_list:
      - metric: acc
        aggregation: mean
        higher_is_better: true
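The task list above deliberately mixes bare strings, dict entries with overrides, a nested group, and aliased duplicates. A minimal sketch of flattening such entries into uniform (name, overrides) pairs, illustrative only and not the harness's actual loader:

from __future__ import annotations

from typing import Any

def normalize_entry(entry: str | dict) -> tuple[str, dict[str, Any]]:
    """Reduce one task-list entry to a (name, overrides) pair."""
    if isinstance(entry, str):  # bare task or tag name, e.g. "piqa"
        return entry, {}
    if "group" in entry:  # nested group carrying its own task list
        return entry["group"], {k: v for k, v in entry.items() if k != "group"}
    return entry["task"], {k: v for k, v in entry.items() if k != "task"}

tasks = [
    "piqa",
    {"task": "sciq", "task_alias": "sciq 2-shot", "num_fewshot": 2},
    {"group": "nli-tasks", "task": ["anli", "boolq", "sglue_rte"], "num_fewshot": 4},
]
for name, overrides in map(normalize_entry, tasks):
    print(name, overrides)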