gaoqiong / lm-evaluation-harness / Commits

Commit 84d02f77, authored Jul 10, 2025 by Baber

    Merge branch 'main' into feature/eval_from_config

Parents: 15ce554c, fcddf195
Changes: 29
Showing 9 changed files with 159 additions and 168 deletions:

    mypy.ini                               +0   -29
    pyproject.toml                         +3   -25
    scripts/zeno_visualize.py              +25  -5
    templates/example_ci_config.yaml       +13  -4
    tests/models/test_neuralmagic.py       +0   -62
    tests/scripts/test_zeno_visualize.py   +40  -0
    tests/test_metrics.py                  +29  -0
    tests/test_tasks.py                    +0   -43
    tests/test_unitxt_tasks.py             +49  -0
mypy.ini  (deleted, 100644 → 0)

[mypy]
python_version = 3.8
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True

# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True

[mypy-lm_eval.api.*]
ignore_errors = True

[mypy-lm_eval.prompts.*]
ignore_errors = True

[mypy-lm_eval.models.*]
ignore_errors = True

[mypy-scripts.*]
ignore_errors = True

[mypy-main]
ignore_errors = True
pyproject.toml

@@ -60,8 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planner==1.4.2"]
 api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 audiolm_qwen = ["librosa", "soundfile"]
-deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 gptqmodel = ["gptqmodel>=1.0.9"]
 hf_transfer = ["hf_transfer"]

@@ -79,41 +78,20 @@ promptsource = ["promptsource>=0.2.3"]
 ruler = ["nltk", "wonderwords", "scipy"]
 sae_lens = ["sae_lens"]
 sentencepiece = ["sentencepiece>=0.1.98"]
-sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 sparsify = ["sparsify"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 unitxt = ["unitxt==1.22.0"]
 vllm = ["vllm>=0.4.2"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 zeno = ["pandas", "zeno-client"]
-all = [
+tasks = [
     "lm_eval[acpbench]",
-    "lm_eval[api]",
-    "lm_eval[audiolm_qwen]",
-    "lm_eval[deepsparse]",
-    "lm_eval[dev]",
-    "lm_eval[gptq]",
-    "lm_eval[gptqmodel]",
-    "lm_eval[hf_transfer]",
-    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
-    "lm_eval[ipex]",
     "lm_eval[japanese_leaderboard]",
     "lm_eval[longbench]",
-    "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
-    "lm_eval[neuronx]",
-    "lm_eval[optimum]",
-    "lm_eval[promptsource]",
     "lm_eval[ruler]",
-    "lm_eval[sae_lens]",
-    "lm_eval[sentencepiece]",
-    "lm_eval[sparseml]",
-    "lm_eval[sparsify]",
-    "lm_eval[testing]",
-    "lm_eval[vllm]",
-    "lm_eval[wandb]",
-    "lm_eval[zeno]",
 ]

 [tool.pymarkdown]
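Note on the extras change above: with the catch-all `all` extra removed, a user of this revision would presumably install task-specific dependencies through the new `tasks` extra (for example `pip install "lm_eval[tasks]"`) or through the individual extras that remain, rather than through `lm_eval[all]`.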
scripts/zeno_visualize.py

@@ -4,6 +4,7 @@ import logging
 import os
 import re
 from pathlib import Path
+from typing import Union

 import pandas as pd
 from zeno_client import ZenoClient, ZenoMetric

@@ -35,6 +36,22 @@ def parse_args():
     return parser.parse_args()


+def sanitize_string(model_args_raw: Union[str, dict]) -> str:
+    """Sanitize the model_args string or dict"""
+    # Convert to string if it's a dictionary
+    model_args_str = (
+        json.dumps(model_args_raw) if isinstance(model_args_raw, dict) else model_args_raw
+    )
+    # Apply the sanitization
+    return re.sub(
+        r"[\"<>:/|\\?*\[\]]+",
+        "__",
+        model_args_str,
+    )
+
+
 def main():
     """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.

@@ -87,13 +104,16 @@ def main():
         latest_sample_results = get_latest_filename(
             [Path(f).name for f in model_sample_filenames if task in f]
         )
-        model_args = re.sub(
-            r"[\"<>:/\|\\?\*\[\]]+",
-            "__",
-            json.load(
-                open(Path(args.data_path, model, latest_results), encoding="utf-8")
-            )["config"]["model_args"],
+        # Load the model_args, which can be either a string or a dictionary
+        model_args = sanitize_string(
+            json.load(
+                open(
+                    Path(args.data_path, model, latest_results),
+                    encoding="utf-8",
+                )
+            )["config"]["model_args"]
         )
         print(model_args)

         data = []
         with open(
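For illustration, here is the new helper in isolation with example inputs (the inputs are made up; the logic mirrors the sanitize_string added in the diff above):

import json
import re


def sanitize_string(model_args_raw):
    # Convert a dict to its JSON string form, then collapse the characters
    # " < > : / | \ ? * [ ] into "__", as in the diff above.
    model_args_str = (
        json.dumps(model_args_raw) if isinstance(model_args_raw, dict) else model_args_raw
    )
    return re.sub(r"[\"<>:/|\\?*\[\]]+", "__", model_args_str)


print(sanitize_string("pretrained=EleutherAI/pythia-160m,dtype=float32"))
# pretrained=EleutherAI__pythia-160m,dtype=float32
print(sanitize_string({"pretrained": "EleutherAI/pythia-160m"}))
# {__pretrained__ __EleutherAI__pythia-160m__}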
templates/example_ci_config.yaml

@@ -4,10 +4,10 @@
 # instead of passing them as command-line arguments.
 #
 # Usage:
-#   $ lm_eval --config configs/default_config.yaml
+#   $ lm_eval --config templates/example_ci_config.yaml
 #
 # You can override any values in this config with command-line arguments:
-#   $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
+#   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
 # All parameters are optional and have the same meaning as their CLI counterparts.

@@ -17,9 +17,18 @@ model_args:
   dtype: float16

 tasks:
   - hellaswag
-  - gsm8k
+  - arc_easy

 batch_size: 1
-device: mps
 trust_remote_code: true
 log_samples: true
 output_path: ./test
+limit: 10
+
+gen_kwargs:
+  do_sample: true
+  temperature: 0.7
+
+samples:
+  hellaswag: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+  arc_easy: [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+
+metadata:
+  name: Example CI Config
+  description: This is an example configuration file for testing purposes.
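As a rough sketch of how a config file like this might be consumed (this is not the harness's actual loader; load_config and its merge rule are hypothetical), values from the YAML file can be read first and then overridden by CLI-style arguments, matching the usage shown in the file's comments:

import yaml  # assumes PyYAML is available


def load_config(path, **cli_overrides):
    """Read a YAML config and let CLI-style values take precedence."""
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    # CLI-style overrides win over values from the file
    config.update({k: v for k, v in cli_overrides.items() if v is not None})
    return config


cfg = load_config(
    "templates/example_ci_config.yaml",
    model_args="pretrained=gpt2",  # mirrors the --model_args override in the comments
    tasks=["mmlu"],                # mirrors the --tasks override
)
print(cfg["tasks"], cfg.get("limit"))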
tests/models/test_neuralmagic.py  (deleted, 100644 → 0)

import pytest

from lm_eval import evaluator
from lm_eval.api.registry import get_model


SPARSEML_MODELS_TASKS = [
    # loglikelihood
    ("facebook/opt-125m", "lambada_openai"),
    # loglikelihood_rolling
    ("hf-internal-testing/tiny-random-gpt2", "wikitext"),
    # generate_until
    ("mgoin/tiny-random-llama-2-quant", "gsm8k"),
]

DEEPSPARSE_MODELS_TASKS = [
    # loglikelihood
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "lambada_openai"),
    # loglikelihood_rolling (not supported yet)
    # ("hf:mgoin/llama2.c-stories15M-quant-ds", "wikitext"),
    # generate_until
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "gsm8k"),
]


@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
    lm = get_model("sparseml").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
            "device": "cpu",
            "dtype": "float32",
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )


@pytest.mark.parametrize("model_id,task", DEEPSPARSE_MODELS_TASKS)
def test_deepsparse_eval(model_id, task):
    lm = get_model("deepsparse").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )
tests/scripts/test_zeno_visualize.py  (new file, 0 → 100644)

import json
import re

import pytest

from scripts.zeno_visualize import sanitize_string


@pytest.skip("requires zeno_client dependency")
def test_zeno_sanitize_string():
    """
    Test that the model_args handling logic in zeno_visualize.py properly handles
    different model_args formats (string and dictionary).
    """
    # Test case 1: model_args as a string
    string_model_args = "pretrained=EleutherAI/pythia-160m,dtype=float32"
    result_string = sanitize_string(string_model_args)
    expected_string = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", string_model_args)

    # Test case 2: model_args as a dictionary
    dict_model_args = {"pretrained": "EleutherAI/pythia-160m", "dtype": "float32"}
    result_dict = sanitize_string(dict_model_args)
    expected_dict = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", json.dumps(dict_model_args))

    # Verify the results
    assert result_string == expected_string
    assert result_dict == expected_dict

    # Also test that the sanitization works as expected
    assert ":" not in result_string  # No colons in sanitized output
    assert ":" not in result_dict  # No colons in sanitized output
    assert "/" not in result_dict  # No slashes in sanitized output
    assert "<" not in result_dict  # No angle brackets in sanitized output


if __name__ == "__main__":
    test_zeno_sanitize_string()
    print("All tests passed.")
tests/test_metrics.py

+import unittest.mock as mock
+
+from lm_eval.api.metrics import _bootstrap_internal_no_mp, mean
 from lm_eval.api.task import ConfigurableTask, TaskConfig

@@ -149,8 +152,34 @@ def test_acc_mutual_info_without_metric():
     assert result_dict["acc"] == 1.0


+def test_bootstrap_internal_no_mp():
+    """Test basic functionality of _bootstrap_internal_no_mp"""
+    data = [1, 2, 3, 4, 5]
+
+    # Mock tqdm to avoid progress bar output during testing
+    with mock.patch("tqdm.tqdm") as mock_tqdm:
+        mock_tqdm.return_value = range(1)  # Single chunk
+        # Mock print to avoid output during testing
+        with mock.patch("builtins.print"):
+            result = _bootstrap_internal_no_mp(mean, data, 100)
+
+    # Should return 100 bootstrap replicates
+    assert len(result) == 100
+    # All results should be numbers (means)
+    assert all(isinstance(x, (int, float)) for x in result)
+    # Bootstrap means should be close to original mean
+    bootstrap_mean = mean(result)
+    original_mean = mean(data)
+    assert abs(bootstrap_mean - original_mean) < 0.5  # Should be reasonably close
+
+
 if __name__ == "__main__":
     test_acc_mutual_info_slicing()
     test_acc_mutual_info_different_predictions()
     test_acc_mutual_info_without_metric()
+    test_bootstrap_internal_no_mp()
     print("All tests passed!")
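For context, the quantity this new test checks can be reproduced with a plain bootstrap: resample the data with replacement, recompute the mean of each resample, and confirm the replicate means hover around the original mean. The sketch below is a conceptual stand-in, not the actual _bootstrap_internal_no_mp implementation:

import random


def bootstrap_means(data, iters, seed=1234):
    # Draw `iters` resamples (with replacement) of the same size as `data`
    # and record the mean of each resample.
    rng = random.Random(seed)
    return [
        sum(rng.choice(data) for _ in data) / len(data)
        for _ in range(iters)
    ]


replicates = bootstrap_means([1, 2, 3, 4, 5], 100)
assert len(replicates) == 100
assert abs(sum(replicates) / len(replicates) - 3.0) < 0.5  # close to the original mean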
tests/test_tasks.py

@@ -46,7 +46,6 @@ def limit() -> int:
     return 10


-# Tests
 class BaseTasks:
     """
     Base class for testing tasks

@@ -166,45 +165,3 @@ class TestNewTasksElseDefault(BaseTasks):
     Test class parameterized with a list of new/modified tasks
     (or a set of default tasks if none have been modified)
     """
-
-
-@pytest.mark.parametrize(
-    "task_class",
-    task_class(
-        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
-    ),
-    ids=lambda x: f"{x.config.task}",
-)
-class TestUnitxtTasks(BaseTasks):
-    """
-    Test class for Unitxt tasks parameterized with a small custom
-    task as described here:
-    https://www.unitxt.ai/en/latest/docs/lm_eval.html
-    """
-
-    def test_check_training_docs(self, task_class: ConfigurableTask):
-        if task_class.has_training_docs():
-            assert task_class.dataset["train"] is not None
-
-    def test_check_validation_docs(self, task_class):
-        if task_class.has_validation_docs():
-            assert task_class.dataset["validation"] is not None
-
-    def test_check_test_docs(self, task_class):
-        task = task_class
-        if task.has_test_docs():
-            assert task.dataset["test"] is not None
-
-    def test_doc_to_text(self, task_class, limit: int):
-        task = task_class
-        arr = (
-            list(islice(task.test_docs(), limit))
-            if task.has_test_docs()
-            else list(islice(task.validation_docs(), limit))
-        )
-        _array = [task.doc_to_text(doc) for doc in arr]
-        if not task.multiple_input:
-            for x in _array:
-                assert isinstance(x, str)
-        else:
-            pass
0 → 100644
View file @
84d02f77
from
itertools
import
islice
import
pytest
from
lm_eval
import
tasks
as
tasks
from
lm_eval.api.task
import
ConfigurableTask
from
tests.test_tasks
import
BaseTasks
,
task_class
@
pytest
.
mark
.
parametrize
(
"task_class"
,
task_class
(
[
"arc_easy_unitxt"
],
tasks
.
TaskManager
(
include_path
=
"./tests/testconfigs"
)
),
ids
=
lambda
x
:
f
"
{
x
.
config
.
task
}
"
,
)
class
TestUnitxtTasks
(
BaseTasks
):
"""
Test class for Unitxt tasks parameterized with a small custom
task as described here:
https://www.unitxt.ai/en/latest/docs/lm_eval.html
"""
def
test_check_training_docs
(
self
,
task_class
:
ConfigurableTask
):
if
task_class
.
has_training_docs
():
assert
task_class
.
dataset
[
"train"
]
is
not
None
def
test_check_validation_docs
(
self
,
task_class
):
if
task_class
.
has_validation_docs
():
assert
task_class
.
dataset
[
"validation"
]
is
not
None
def
test_check_test_docs
(
self
,
task_class
):
task
=
task_class
if
task
.
has_test_docs
():
assert
task
.
dataset
[
"test"
]
is
not
None
def
test_doc_to_text
(
self
,
task_class
,
limit
:
int
):
task
=
task_class
arr
=
(
list
(
islice
(
task
.
test_docs
(),
limit
))
if
task
.
has_test_docs
()
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
_array
=
[
task
.
doc_to_text
(
doc
)
for
doc
in
arr
]
if
not
task
.
multiple_input
:
for
x
in
_array
:
assert
isinstance
(
x
,
str
)
else
:
pass
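Running this relocated test module presumably requires the unitxt dependency (pinned as unitxt==1.22.0 in the pyproject.toml changes above) and the small custom task config under ./tests/testconfigs that the parametrization points at; the test bodies themselves are unchanged from the class removed from tests/test_tasks.py.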