Commit 88486e57 authored by lintangsutawika

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
from typing import List
import pytest
import torch
from lm_eval import tasks
from lm_eval.api.instance import Instance
......@@ -11,7 +10,7 @@ task_manager = tasks.TaskManager()
@pytest.mark.skip(reason="requires CUDA")
-class TEST_VLLM:
+class Test_VLLM:
    vllm = pytest.importorskip("vllm")
    try:
        from lm_eval.models.vllm_causallms import VLLM
......@@ -19,7 +18,7 @@ class TEST_VLLM:
        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError:
        pass
-    torch.use_deterministic_algorithms(True)
+    # torch.use_deterministic_algorithms(True)
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
......
import os
import re
from typing import List
import pytest
......@@ -6,6 +7,7 @@ import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
from lm_eval.utils import make_table
os.environ["TOKENIZERS_PARALLELISM"] = "false"
......@@ -31,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
            10000,
        ),
    ],
    ids=lambda d: f"{d}",
)
def test_evaluator(
    task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
......@@ -75,3 +78,74 @@ def test_evaluator(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )
@pytest.mark.parametrize(
    "task_name,limit,model,model_args",
    [
        (
            ["ai2_arc"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["lambada_openai"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["wikitext"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
    ],
    ids=lambda d: f"{d}",
)
def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
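    # compare the table printed by make_table() against a stored snapshot under ./tests/testdata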
    results = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
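        # bootstrap_iters=0 skips bootstrap stderr estimation, so the Stderr column prints N/A;
        # all seeds are pinned so the printed values stay reproducible across runs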
        bootstrap_iters=0,
        random_seed=0,
        numpy_random_seed=0,
        torch_random_seed=0,
        fewshot_random_seed=0,
    )
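    # snapshot filename is built from the task list, limit, model, and sanitized model_args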
    filename = "_".join(
        (
            "-".join(task_name),
            str(limit),
            str(model),
            re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
        )
    )
    filepath = f"./tests/testdata/{filename}.txt"
    with open(filepath, "r") as f:
        t1 = f.read().strip()
    t2 = make_table(results).strip()
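    # compare row by row and cell by cell: numeric cells must agree within 0.3, text cells must match exactly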
    t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
    assert len(t1_lines) == len(t2_lines)
    for t1_line, t2_line in zip(t1_lines, t2_lines):
        t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
        assert len(t1_items) == len(t2_items)
        for t1_item, t2_item in zip(t1_items, t2_items):
            try:
                t1_item = float(t1_item)
                t2_item = float(t2_item)
                assert abs(t1_item - t2_item) < 0.3
            except ValueError:
                assert t1_item == t2_item
import os
import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
@pytest.mark.parametrize(
"limit,model,model_args",
[
(
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
),
],
)
def test_include_correctness(limit: int, model: str, model_args: str):
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager()
    task_dict = tasks.get_task_dict(task_name, task_manager)
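    # first run: simple_evaluate() with the default packaged "arc_easy" config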
    e1 = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
    )
    assert e1 is not None
    # run with evaluate() and "arc_easy" test config (included from ./testconfigs path)
    lm = api.registry.get_model(model).create_from_arg_string(
        model_args,
        {
            "batch_size": None,
            "max_batch_size": None,
            "device": None,
        },
    )
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )
    task_dict = tasks.get_task_dict(task_name, task_manager)
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
    )
    assert e2 is not None
    # check that caching is working
    def r(x):
        return x["results"]["arc_easy"]
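    # every arc_easy metric from the default-config run should match the ./testconfigs run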
    assert all(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )
# test that setting include_defaults = False works as expected and that include_path works
def test_no_include_defaults():
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )
    # should succeed, because we've included an 'arc_easy' task from this dir
    task_dict = tasks.get_task_dict(task_name, task_manager)
    # should fail, since ./testconfigs has no arc_challenge task
    task_name = ["arc_challenge"]
    with pytest.raises(KeyError):
        task_dict = tasks.get_task_dict(task_name, task_manager)  # noqa: F841
# test that include_path containing a task shadowing another task's name fails
# def test_shadowed_name_fails():
# task_name = ["arc_easy"]
# task_manager = tasks.TaskManager(include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs")
# task_dict = tasks.get_task_dict(task_name, task_manager)
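# local copy of the arc_easy task config, picked up by the include_path tests above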
task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file