Test output table layout consistency (#1916)

* sort metrics in output table
* update docstring in `consolidate_results`
* add tests for verifying consistency of table output
* update tests to account for floating point inconsistencies
* update tests based on `pythia-14m`

Test output table layout consistency (#1916)
* sort metrics in output table
* update docstring in `consolidate_results`
* add tests for verifying consistency of table output
* update tests to account for floating point inconsistencies
* update tests based on `pythia-14m`
40f5458f · Zafir Stojanovski · GitHub · 59418aac · 40f5458f · 40f5458f
Unverified Commit 40f5458f authored Jun 07, 2024 by Zafir Stojanovski Committed by GitHub Jun 07, 2024
7 changed files
--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
@@ -223,7 +223,7 @@ def prepare_print_tasks(
 def consolidate_results(
    eval_tasks: List[TaskOutput],
-) -> Tuple[dict, dict, dict, dict, dict]:
+) -> Tuple[dict, dict, dict, dict, dict, dict]:
    """
    @param eval_tasks: list(TaskOutput).
    @return: A tuple containing the consolidated results, samples, configs, versions, num_fewshot, and higher_is_better.
@@ -240,6 +240,8 @@ def consolidate_results(
    - configs: A defaultdict with task names as keys and task configurations as values.
    - versions: A defaultdict with task names as keys and task versions as values.
    - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
+    - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
+    for each metric as values.
    The method then returns the consolidated results, samples, configs, versions, num_fewshot, and higher_is_better as a tuple.
    """

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -300,7 +300,11 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
        if "alias" in dic:
            k = dic.pop("alias")
-        for (mf), v in dic.items():
+        metric_items = dic.items()
+        if sort_results:
+            metric_items = sorted(metric_items)
+        for (mf), v in metric_items:
            m, _, f = mf.partition(",")
            if m.endswith("_stderr"):
                continue

--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
 import os
+import re
 from typing import List
 import pytest
@@ -6,6 +7,7 @@ import pytest
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
 from lm_eval import tasks
+from lm_eval.utils import make_table
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -75,3 +77,73 @@ def test_evaluator(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )
+@pytest.mark.parametrize(
+    "task_name,limit,model,model_args",
+    [
+        (
+            ["ai2_arc"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+        (
+            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+        (
+            ["lambada_openai"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+        (
+            ["wikitext"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+    ],
+)
+def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
+    """Check that the table printed for an evaluation run matches a stored reference.
+
+    Runs `evaluator.simple_evaluate` with fixed seeds, renders the results with
+    `make_table`, and compares the rendering cell by cell against the reference
+    table in `tests/testdata`. Cells that parse as numbers are compared with an
+    absolute tolerance of 0.1 to absorb floating point differences; all other
+    cells (headers, separators, task names, arrows, "N/A") must match exactly.
+
+    @param task_name: names of the tasks to evaluate.
+    @param limit: value passed as `limit` to `simple_evaluate`.
+    @param model: model type passed to `simple_evaluate`.
+    @param model_args: model argument string passed to `simple_evaluate`.
+    """
+    results = evaluator.simple_evaluate(
+        model=model,
+        tasks=task_name,
+        limit=limit,
+        model_args=model_args,
+        bootstrap_iters=0,
+        random_seed=0,
+        numpy_random_seed=0,
+        torch_random_seed=0,
+        fewshot_random_seed=0,
+    )
+    # Reference file name: tasks, limit, model and the sanitized model_args,
+    # joined by "_". Characters outside [a-zA-Z0-9_-.] in model_args become "-".
+    filename = "_".join(
+        (
+            "-".join(task_name),
+            str(limit),
+            str(model),
+            re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
+        )
+    )
+    # Resolve the reference file relative to this test module so the test does
+    # not depend on the directory pytest is launched from.
+    filepath = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "testdata", f"{filename}.txt"
+    )
+    # The reference tables contain non-ASCII symbols (↑, ↓, ±); read them with an
+    # explicit encoding instead of the platform default.
+    with open(filepath, "r", encoding="utf-8") as f:
+        t1 = f.read().strip()
+    t2 = make_table(results).strip()
+    t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
+    assert len(t1_lines) == len(t2_lines), "row count differs from reference table"
+    for t1_line, t2_line in zip(t1_lines, t2_lines):
+        t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
+        assert len(t1_items) == len(t2_items), "column count differs from reference table"
+        for t1_item, t2_item in zip(t1_items, t2_items):
+            try:
+                # Only the conversions live in the try body, so ValueError can
+                # only mean "this cell is not numeric".
+                expected, actual = float(t1_item), float(t2_item)
+            except ValueError:
+                # Non-numeric cell: require an exact textual match.
+                assert t1_item == t2_item
+            else:
+                # Numeric cell: tolerate small floating point differences.
+                assert abs(expected - actual) < 0.1
--- a/tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+++ b/tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+|     Tasks      |Version|Filter|n-shot| Metric |   |Value|   |Stderr|
+|----------------|-------|------|-----:|--------|---|----:|---|------|
+|ai2_arc         |N/A    |none  |     0|acc     |↑  | 0.15|±  |N/A   |
+|                |       |none  |     0|acc_norm|↑  | 0.05|±  |N/A   |
+| - arc_challenge|      1|none  |     0|acc     |↑  | 0.00|±  |N/A   |
+|                |       |none  |     0|acc_norm|↑  | 0.00|±  |N/A   |
+| - arc_easy     |      1|none  |     0|acc     |↑  | 0.30|±  |N/A   |
+|                |       |none  |     0|acc_norm|↑  | 0.10|±  |N/A   |
\ No newline at end of file
--- a/tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+++ b/tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+|    Tasks     |Version|Filter|n-shot|  Metric  |   | Value  |   |Stderr|
+|--------------|------:|------|-----:|----------|---|-------:|---|------|
+|lambada_openai|      1|none  |     0|acc       |↑  |  0.1000|±  |N/A   |
+|              |       |none  |     0|perplexity|↓  |605.4879|±  |N/A   |
\ No newline at end of file
--- a/tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+++ b/tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+|     Tasks      |Version|Filter|n-shot|Metric|   |Value|   |Stderr|
+|----------------|------:|------|-----:|------|---|----:|---|------|
+|abstract_algebra|      0|none  |     0|acc   |↑  |  0.2|±  |N/A   |
+|global_facts    |      0|none  |     0|acc   |↑  |  0.2|±  |N/A   |
+|public_relations|      0|none  |     0|acc   |↑  |  0.2|±  |N/A   |
\ No newline at end of file
--- a/tests/testdata/wikitext_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+++ b/tests/testdata/wikitext_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
+| Tasks  |Version|Filter|n-shot|    Metric     |   | Value  |   |Stderr|
+|--------|------:|------|-----:|---------------|---|-------:|---|------|
+|wikitext|      2|none  |     0|bits_per_byte  |↓  |  1.3394|±  |N/A   |
+|        |       |none  |     0|byte_perplexity|↓  |  2.5304|±  |N/A   |
+|        |       |none  |     0|word_perplexity|↓  |130.4812|±  |N/A   |
\ No newline at end of file