Unverified Commit 3d1b8f43 authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into group-agg-rework

parents e200c24e d855d0ba
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
......
@@ -152,6 +152,55 @@ def general_detokenize(string):
return string
def get_file_task_name(filename: str) -> str:
"""
Given a sample-results filename, extracts and returns the task name.
"""
return filename[filename.find("_") + 1 : filename.rfind("_")]
def get_file_datetime(filename: str) -> str:
"""
Given a results or sample-results filename, extracts and returns the datetime.
"""
return filename[filename.rfind("_") + 1 :].replace(".json", "")
def sanitize_model_name(model_name: str) -> str:
"""
Given the model name, returns a sanitized version of it.
"""
return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
def sanitize_task_name(task_name: str) -> str:
"""
Given the task name, returns a sanitized version of it.
"""
return re.sub(r"\W", "_", task_name)
def get_latest_filename(filenames: List[str]) -> str:
"""
Given a list of filenames, returns the filename with the latest datetime.
"""
return max(filenames, key=lambda f: get_file_datetime(f))
def get_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to aggregated results.
"""
return [f for f in filenames if "/results_" in f and ".json" in f]
def get_sample_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to sample results.
"""
return [f for f in filenames if "/samples_" in f and ".json" in f]
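These helpers assume result files named roughly `results_<datetime>.json` and sample files named `samples_<task>_<datetime>.json` (the task name sits between the first and last underscore). A minimal sketch of how they compose, using hypothetical paths:

```python
from pathlib import Path

from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    get_results_filenames,
    get_sample_results_filenames,
    sanitize_model_name,
)

# Hypothetical output directory for one model, following the
# results_<datetime>.json / samples_<task>_<datetime>.json pattern these helpers parse.
files = [
    "out/EleutherAI__pythia-70m/results_2024-05-01T10-00-00.json",
    "out/EleutherAI__pythia-70m/results_2024-05-02T10-00-00.json",
    "out/EleutherAI__pythia-70m/samples_arc_easy_2024-05-02T10-00-00.json",
]

get_results_filenames(files)         # keeps the two results_*.json paths
get_sample_results_filenames(files)  # keeps the samples_arc_easy_*.json path
get_latest_filename([Path(f).name for f in get_results_filenames(files)])
# -> "results_2024-05-02T10-00-00.json"
get_file_task_name("samples_arc_easy_2024-05-02T10-00-00.json")  # -> "arc_easy"
get_file_datetime("samples_arc_easy_2024-05-02T10-00-00.json")   # -> "2024-05-02T10-00-00"
sanitize_model_name("EleutherAI/pythia-70m")                     # -> "EleutherAI__pythia-70m"
```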
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
@@ -300,7 +349,11 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
if "alias" in dic:
k = dic.pop("alias")
-for (mf), v in dic.items():
+metric_items = dic.items()
+if sort_results:
+    metric_items = sorted(metric_items)
+for (mf), v in metric_items:
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
continue
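For reference, the keys iterated here have the form `<metric>,<filter>`, and the new `sort_results` flag simply orders them before printing. A minimal illustration with made-up values:

```python
dic = {"acc,none": 0.30, "acc_stderr,none": 0.02, "acc_norm,none": 0.10}

metric_items = sorted(dic.items())  # what sort_results=True does
for mf, v in metric_items:
    m, _, f = mf.partition(",")     # "acc,none" -> metric "acc", filter "none"
    if m.endswith("_stderr"):       # stderr values are attached to their metric's row, not printed alone
        continue
    print(f"{m} ({f}): {v}")        # acc (none): 0.3, then acc_norm (none): 0.1
```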
......
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "0.4.2"
version = "0.4.3"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
......
@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
4) Training data sets split into more than `too_dirty_cutoff` are considered
-completey contaminated and removed
+completely contaminated and removed
OpenAI used:
```
......
@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import json
import logging
import os
......
@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
......
@@ -70,6 +70,11 @@ def main():
if docs is not None:
iters.append(docs)
if len(iters) == 0:
raise ValueError(
f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
)
docs = join_iters(iters)
with open(
......
@@ -7,7 +7,12 @@ from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
-from lm_eval.utils import eval_logger
+from lm_eval.utils import (
+    eval_logger,
+    get_latest_filename,
+    get_results_filenames,
+    get_sample_results_filenames,
+)
def parse_args():
@@ -45,13 +50,15 @@ def main():
assert len(models) > 0, "No model directories found in the data_path."
# Get the tasks from the latest results file of the first model.
tasks = set(tasks_for_model(models[0], args.data_path))
-for model in models:  # Make sure that all models have the same tasks.
+# Get task names from the latest results file for each model
+# Get intersection of tasks for all models
+for model in models:
old_tasks = tasks.copy()
task_count = len(tasks)
-model_tasks = tasks_for_model(model, args.data_path)
+model_tasks = set(tasks_for_model(model, args.data_path))
tasks.intersection(set(model_tasks))
if task_count != len(tasks):
@@ -66,22 +73,36 @@ def main():
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
)
print(model_args)
data = []
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
Path(args.data_path, model, latest_sample_results),
"r",
encoding="utf-8",
) as file:
-data = json.loads(file.read())
+for line in file:
+    data.append(json.loads(line.strip()))
configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"]
config = configs[task]
@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
Returns:
list: A list of tasks for the model.
"""
-dir_path = Path(data_path, model)
-config = (
-    json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
-)
+# get latest model results for a given name
+model_dir = Path(data_path, model)
+model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+model_results_filenames = get_results_filenames(model_files)
+latest_results = get_latest_filename(model_results_filenames)
+config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
return list(config[0].keys())
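With this change, `tasks_for_model` scans the model's directory for `results_*` files and reads the `configs` of the most recent one instead of a hardcoded `results.json`. A sketch of the intended usage, with a hypothetical output layout:

```python
# Hypothetical layout under data_path:
#   output/EleutherAI__pythia-70m/results_2024-05-01T10-00-00.json
#   output/EleutherAI__pythia-70m/results_2024-05-02T10-00-00.json   <- newest, picked
#   output/EleutherAI__pythia-70m/samples_arc_easy_2024-05-02T10-00-00.json
tasks = tasks_for_model("EleutherAI__pythia-70m", "output/")
print(tasks)  # e.g. ["arc_easy", "gsm8k", "wikitext"]: the keys of the latest run's "configs"
```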
......
@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string(
......
from typing import List
import pytest
import torch
from lm_eval import tasks
from lm_eval.api.instance import Instance
@@ -11,7 +10,7 @@ task_manager = tasks.TaskManager()
@pytest.mark.skip(reason="requires CUDA")
-class TEST_VLLM:
+class Test_VLLM:
vllm = pytest.importorskip("vllm")
try:
from lm_eval.models.vllm_causallms import VLLM
@@ -19,7 +18,7 @@ class TEST_VLLM:
LM = VLLM(pretrained="EleutherAI/pythia-70m")
except ModuleNotFoundError:
pass
-torch.use_deterministic_algorithms(True)
+# torch.use_deterministic_algorithms(True)
task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
multiple_choice_task = task_list["arc_easy"] # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
......
import os
import re
from typing import List
import pytest
@@ -6,6 +7,7 @@ import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
from lm_eval.utils import make_table
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -31,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
10000,
),
],
ids=lambda d: f"{d}",
)
def test_evaluator(
task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
@@ -75,3 +78,74 @@ def test_evaluator(
x == y
for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
)
@pytest.mark.parametrize(
"task_name,limit,model,model_args",
[
(
["ai2_arc"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
(
["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
(
["lambada_openai"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
(
["wikitext"],
10,
"hf",
"pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
),
],
ids=lambda d: f"{d}",
)
def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
results = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
model_args=model_args,
bootstrap_iters=0,
random_seed=0,
numpy_random_seed=0,
torch_random_seed=0,
fewshot_random_seed=0,
)
filename = "_".join(
(
"-".join(task_name),
str(limit),
str(model),
re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
)
)
filepath = f"./tests/testdata/{filename}.txt"
with open(filepath, "r") as f:
t1 = f.read().strip()
t2 = make_table(results).strip()
t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
assert len(t1_lines) == len(t2_lines)
for t1_line, t2_line in zip(t1_lines, t2_lines):
t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
assert len(t1_items) == len(t2_items)
for t1_item, t2_item in zip(t1_items, t2_items):
try:
t1_item = float(t1_item)
t2_item = float(t2_item)
assert abs(t1_item - t2_item) < 0.3
except ValueError:
assert t1_item == t2_item
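The cell-by-cell comparison above leans on `float()` raising `ValueError` for non-numeric cells: numeric cells get a loose 0.3 tolerance, everything else must match exactly. A small illustration (the helper below is just a restatement for clarity, not part of the test):

```python
def compare_cell(a: str, b: str) -> bool:
    # Numeric cells (surrounding whitespace is fine) compare within a 0.3 tolerance;
    # non-numeric cells (task names, "↑"/"↓", blanks) must match exactly.
    try:
        return abs(float(a) - float(b)) < 0.3
    except ValueError:
        return a == b

compare_cell(" 0.3394 ", " 0.3305")  # True: within tolerance
compare_cell("acc ", "acc ")         # True: exact text match
```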
import os
import pytest
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
@pytest.mark.parametrize(
"limit,model,model_args",
[
(
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
),
],
)
def test_include_correctness(limit: int, model: str, model_args: str):
task_name = ["arc_easy"]
task_manager = tasks.TaskManager()
task_dict = tasks.get_task_dict(task_name, task_manager)
e1 = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
model_args=model_args,
)
assert e1 is not None
# run with evaluate() and "arc_easy" test config (included from ./testconfigs path)
lm = api.registry.get_model(model).create_from_arg_string(
model_args,
{
"batch_size": None,
"max_batch_size": None,
"device": None,
},
)
task_name = ["arc_easy"]
task_manager = tasks.TaskManager(
include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
include_defaults=False,
)
task_dict = tasks.get_task_dict(task_name, task_manager)
e2 = evaluator.evaluate(
lm=lm,
task_dict=task_dict,
limit=limit,
)
assert e2 is not None
# check that caching is working
def r(x):
return x["results"]["arc_easy"]
assert all(
x == y
for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
)
# test that setting include_defaults = False works as expected and that include_path works
def test_no_include_defaults():
task_name = ["arc_easy"]
task_manager = tasks.TaskManager(
include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
include_defaults=False,
)
# should succeed, because we've included an 'arc_easy' task from this dir
task_dict = tasks.get_task_dict(task_name, task_manager)
# should fail, since ./testconfigs has no arc_challenge task
task_name = ["arc_challenge"]
with pytest.raises(KeyError):
task_dict = tasks.get_task_dict(task_name, task_manager) # noqa: F841
# test that include_path containing a task shadowing another task's name fails
# def test_shadowed_name_fails():
# task_name = ["arc_easy"]
# task_manager = tasks.TaskManager(include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs")
# task_dict = tasks.get_task_dict(task_name, task_manager)
task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
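For context, `doc_to_text`, `doc_to_target`, and `doc_to_choice` above are Jinja-style templates rendered against each dataset record; with a hypothetical ARC-Easy record they resolve roughly as follows:

```python
# Hypothetical ARC-Easy record (fields follow the allenai/ai2_arc schema).
doc = {
    "question": "Which gas do green plants absorb from the air?",
    "choices": {
        "label": ["A", "B", "C", "D"],
        "text": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
    },
    "answerKey": "B",
}

# doc_to_text   -> "Question: Which gas do green plants absorb from the air?\nAnswer:"
# doc_to_target -> choices.label.index(answerKey) == 1  (index of the gold choice)
# doc_to_choice -> ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"]
```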
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file