Unverified commit 40f5458f, authored by Zafir Stojanovski and committed by GitHub
Browse files

Test output table layout consistency (#1916)

* sort metrics in output table

* update docstring in `consolidate_results`

* add tests for verifying consistency of table output

* update tests to account for floating point inconsistencies

* updated tests based on `pythia-14m`
parent 59418aac
...@@ -223,7 +223,7 @@ def prepare_print_tasks( ...@@ -223,7 +223,7 @@ def prepare_print_tasks(
def consolidate_results( def consolidate_results(
eval_tasks: List[TaskOutput], eval_tasks: List[TaskOutput],
) -> Tuple[dict, dict, dict, dict, dict]: ) -> Tuple[dict, dict, dict, dict, dict, dict]:
""" """
@param eval_tasks: list(TaskOutput). @param eval_tasks: list(TaskOutput).
@return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot. @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
...@@ -240,6 +240,8 @@ def consolidate_results( ...@@ -240,6 +240,8 @@ def consolidate_results(
- configs: A defaultdict with task names as keys and task configurations as values. - configs: A defaultdict with task names as keys and task configurations as values.
- versions: A defaultdict with task names as keys and task versions as values. - versions: A defaultdict with task names as keys and task versions as values.
- num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values. - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
- higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
for each metric as values.
The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple. The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
""" """
......
...@@ -300,7 +300,11 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True): ...@@ -300,7 +300,11 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
if "alias" in dic: if "alias" in dic:
k = dic.pop("alias") k = dic.pop("alias")
for (mf), v in dic.items(): metric_items = dic.items()
if sort_results:
metric_items = sorted(metric_items)
for (mf), v in metric_items:
m, _, f = mf.partition(",") m, _, f = mf.partition(",")
if m.endswith("_stderr"): if m.endswith("_stderr"):
continue continue
......
import os import os
import re
from typing import List from typing import List
import pytest import pytest
...@@ -6,6 +7,7 @@ import pytest ...@@ -6,6 +7,7 @@ import pytest
import lm_eval.api as api import lm_eval.api as api
import lm_eval.evaluator as evaluator import lm_eval.evaluator as evaluator
from lm_eval import tasks from lm_eval import tasks
from lm_eval.utils import make_table
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
...@@ -75,3 +77,73 @@ def test_evaluator( ...@@ -75,3 +77,73 @@ def test_evaluator(
x == y x == y
for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()]) for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
) )
@pytest.mark.parametrize(
    "task_name,limit,model,model_args",
    [
        (
            ["ai2_arc"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["lambada_openai"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["wikitext"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
    ],
)
def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
    """Compare the table rendered for an evaluation run with a stored fixture.

    The evaluation is run through ``evaluator.simple_evaluate`` with every
    seed pinned to 0, the result is rendered with ``make_table`` and the
    rendered text is compared cell by cell with the expected table stored
    under ``./tests/testdata``.

    Cells that parse as floats on both sides only have to agree within an
    absolute tolerance of 0.1; every other cell must match exactly.

    @param task_name: names of the tasks to evaluate; also joined with "-" to
        form the first part of the fixture file name.
    @param limit: passed through as ``limit`` to ``simple_evaluate``.
    @param model: model type identifier passed to ``simple_evaluate``.
    @param model_args: model argument string passed to ``simple_evaluate``;
        sanitised before being used in the fixture file name.
    """
    results = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
        bootstrap_iters=0,
        random_seed=0,
        numpy_random_seed=0,
        torch_random_seed=0,
        fewshot_random_seed=0,
    )

    # The fixture name is derived from the parameters; any character of
    # model_args outside [a-zA-Z0-9_.-] is replaced by "-" so the name is a
    # plain file name.
    filename = "_".join(
        (
            "-".join(task_name),
            str(limit),
            str(model),
            re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
        )
    )
    filepath = f"./tests/testdata/{filename}.txt"

    # The expected tables contain non-ASCII glyphs (arrows, plus-minus sign),
    # so the encoding is given explicitly instead of relying on the
    # platform's locale default.
    with open(filepath, "r", encoding="utf-8") as f:
        t1 = f.read().strip()

    t2 = make_table(results).strip()

    t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
    assert len(t1_lines) == len(t2_lines)
    for t1_line, t2_line in zip(t1_lines, t2_lines):
        t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
        assert len(t1_items) == len(t2_items)
        for t1_item, t2_item in zip(t1_items, t2_items):
            # Only the conversions live in the try body, and the parsed values
            # get their own names, so the textual fallback always compares
            # the two original strings.
            try:
                t1_value = float(t1_item)
                t2_value = float(t2_item)
            except ValueError:
                assert t1_item == t2_item
            else:
                assert abs(t1_value - t2_value) < 0.1
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment