Commit 84d02f77 authored by Baber

Merge branch 'main' into feature/eval_from_config

parents 15ce554c fcddf195
[mypy]
python_version = 3.8
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True
# Ignore errors in these modules for now so type annotations can be added gradually
[mypy-lm_eval.*]
ignore_errors = True
[mypy-lm_eval.api.*]
ignore_errors = True
[mypy-lm_eval.prompts.*]
ignore_errors = True
[mypy-lm_eval.models.*]
ignore_errors = True
[mypy-scripts.*]
ignore_errors = True
[mypy-main]
ignore_errors = True
@@ -60,8 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planner==1.4.2"]
 api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 audiolm_qwen = ["librosa", "soundfile"]
-deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 gptqmodel = ["gptqmodel>=1.0.9"]
 hf_transfer = ["hf_transfer"]
@@ -79,41 +78,20 @@ promptsource = ["promptsource>=0.2.3"]
 ruler = ["nltk", "wonderwords", "scipy"]
 sae_lens = ["sae_lens"]
 sentencepiece = ["sentencepiece>=0.1.98"]
-sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 sparsify = ["sparsify"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-unitxt = ["unitxt==1.22.0"]
 vllm = ["vllm>=0.4.2"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 zeno = ["pandas", "zeno-client"]
-all = [
+tasks = [
     "lm_eval[acpbench]",
-    "lm_eval[api]",
-    "lm_eval[audiolm_qwen]",
-    "lm_eval[deepsparse]",
-    "lm_eval[dev]",
-    "lm_eval[gptq]",
-    "lm_eval[gptqmodel]",
-    "lm_eval[hf_transfer]",
-    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
-    "lm_eval[ipex]",
     "lm_eval[japanese_leaderboard]",
     "lm_eval[longbench]",
-    "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
-    "lm_eval[neuronx]",
-    "lm_eval[optimum]",
-    "lm_eval[promptsource]",
     "lm_eval[ruler]",
-    "lm_eval[sae_lens]",
-    "lm_eval[sentencepiece]",
-    "lm_eval[sparseml]",
-    "lm_eval[sparsify]",
-    "lm_eval[testing]",
-    "lm_eval[vllm]",
-    "lm_eval[wandb]",
-    "lm_eval[zeno]",
 ]
 [tool.pymarkdown]
...
@@ -4,6 +4,7 @@ import logging
 import os
 import re
 from pathlib import Path
+from typing import Union
 import pandas as pd
 from zeno_client import ZenoClient, ZenoMetric
@@ -35,6 +36,22 @@ def parse_args():
     return parser.parse_args()
+def sanitize_string(model_args_raw: Union[str, dict]) -> str:
+    """Sanitize the model_args string or dict"""
+    # Convert to string if it's a dictionary
+    model_args_str = (
+        json.dumps(model_args_raw)
+        if isinstance(model_args_raw, dict)
+        else model_args_raw
+    )
+    # Apply the sanitization
+    return re.sub(
+        r"[\"<>:/|\\?*\[\]]+",
+        "__",
+        model_args_str,
+    )
 def main():
     """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.
@@ -87,13 +104,16 @@ def main():
         latest_sample_results = get_latest_filename(
             [Path(f).name for f in model_sample_filenames if task in f]
         )
-        model_args = re.sub(
-            r"[\"<>:/\|\\?\*\[\]]+",
-            "__",
+        # Load the model_args, which can be either a string or a dictionary
+        model_args = sanitize_string(
             json.load(
-                open(Path(args.data_path, model, latest_results), encoding="utf-8")
-            )["config"]["model_args"],
+                open(
+                    Path(args.data_path, model, latest_results),
+                    encoding="utf-8",
+                )
+            )["config"]["model_args"]
         )
         print(model_args)
         data = []
         with open(
...
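For reference (not part of this commit), a standalone sketch of what the new sanitize_string helper produces. The regex is copied from the hunk above; the helper is re-implemented locally here only so the example does not require the zeno_client dependency that scripts.zeno_visualize imports at module level:

import json
import re

# Same character class as sanitize_string in scripts/zeno_visualize.py above.
_UNSAFE = re.compile(r"[\"<>:/|\\?*\[\]]+")

def sanitize(model_args):
    """Collapse filesystem-unfriendly characters to '__' (accepts str or dict)."""
    text = json.dumps(model_args) if isinstance(model_args, dict) else model_args
    return _UNSAFE.sub("__", text)

print(sanitize("pretrained=EleutherAI/pythia-160m,dtype=float32"))
# pretrained=EleutherAI__pythia-160m,dtype=float32
print(sanitize({"pretrained": "EleutherAI/pythia-160m"}))
# {__pretrained__ __EleutherAI__pythia-160m__}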
@@ -4,10 +4,10 @@
 # instead of passing them as command-line arguments.
 #
 # Usage:
-#   $ lm_eval --config configs/default_config.yaml
+#   $ lm_eval --config templates/example_ci_config.yaml
 #
 # You can override any values in this config with command-line arguments:
-#   $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
+#   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
 # All parameters are optional and have the same meaning as their CLI counterparts.
@@ -17,9 +17,18 @@ model_args:
   dtype: float16
 tasks:
   - hellaswag
-  - gsm8k
+  - arc_easy
 batch_size: 1
+device: mps
 trust_remote_code: true
 log_samples: true
 output_path: ./test
-limit: 10
+gen_kwargs:
+  do_sample: true
+  temperature: 0.7
+samples:
+  hellaswag: [1,2,3,4,5,6,7,8,9,10]
+  arc_easy: [10,20,30,40,50,60,70,80,90,100]
+metadata:
+  name: Example CI Config
+  description: This is an example configuration file for testing purposes.
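As a side note (illustrative only, not the harness's actual loader): the comments in this template state that command-line flags take precedence over values from the file. A minimal sketch of that precedence rule using PyYAML; the function name and call pattern are hypothetical:

import yaml  # PyYAML

def load_config_with_overrides(config_path, cli_overrides):
    """Overlay explicitly-set CLI values on top of the YAML config (sketch)."""
    with open(config_path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}
    # Only keys the user actually passed on the command line win over the file.
    cfg.update({k: v for k, v in cli_overrides.items() if v is not None})
    return cfg

# e.g. `--model_args pretrained=gpt2 --tasks mmlu` on top of this template:
merged = load_config_with_overrides(
    "templates/example_ci_config.yaml",
    {"model_args": "pretrained=gpt2", "tasks": ["mmlu"]},
)
print(merged["tasks"])       # ['mmlu'] (CLI wins)
print(merged["batch_size"])  # 1 (from the file)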
import pytest

from lm_eval import evaluator
from lm_eval.api.registry import get_model


SPARSEML_MODELS_TASKS = [
    # loglikelihood
    ("facebook/opt-125m", "lambada_openai"),
    # loglikelihood_rolling
    ("hf-internal-testing/tiny-random-gpt2", "wikitext"),
    # generate_until
    ("mgoin/tiny-random-llama-2-quant", "gsm8k"),
]

DEEPSPARSE_MODELS_TASKS = [
    # loglikelihood
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "lambada_openai"),
    # loglikelihood_rolling (not supported yet)
    # ("hf:mgoin/llama2.c-stories15M-quant-ds", "wikitext"),
    # generate_until
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "gsm8k"),
]


@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
    lm = get_model("sparseml").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
            "device": "cpu",
            "dtype": "float32",
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )


@pytest.mark.parametrize("model_id,task", DEEPSPARSE_MODELS_TASKS)
def test_deepsparse_eval(model_id, task):
    lm = get_model("deepsparse").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )
import json
import re

import pytest

# scripts.zeno_visualize imports zeno_client at module level, so skip this
# module entirely when the optional dependency is not installed.
pytest.importorskip("zeno_client")

from scripts.zeno_visualize import sanitize_string


def test_zeno_sanitize_string():
    """
    Test that the model_args handling logic in zeno_visualize.py properly handles
    different model_args formats (string and dictionary).
    """
    # Test case 1: model_args as a string
    string_model_args = "pretrained=EleutherAI/pythia-160m,dtype=float32"
    result_string = sanitize_string(string_model_args)
    expected_string = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", string_model_args)

    # Test case 2: model_args as a dictionary
    dict_model_args = {"pretrained": "EleutherAI/pythia-160m", "dtype": "float32"}
    result_dict = sanitize_string(dict_model_args)
    expected_dict = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", json.dumps(dict_model_args))

    # Verify the results
    assert result_string == expected_string
    assert result_dict == expected_dict

    # Also test that the sanitization works as expected
    assert ":" not in result_string  # No colons in sanitized output
    assert ":" not in result_dict  # No colons in sanitized output
    assert "/" not in result_dict  # No slashes in sanitized output
    assert "<" not in result_dict  # No angle brackets in sanitized output


if __name__ == "__main__":
    test_zeno_sanitize_string()
    print("All tests passed.")
+import unittest.mock as mock
+from lm_eval.api.metrics import _bootstrap_internal_no_mp, mean
 from lm_eval.api.task import ConfigurableTask, TaskConfig
@@ -149,8 +152,34 @@ def test_acc_mutual_info_without_metric():
     assert result_dict["acc"] == 1.0
+def test_bootstrap_internal_no_mp():
+    """Test basic functionality of _bootstrap_internal_no_mp"""
+    data = [1, 2, 3, 4, 5]
+    # Mock tqdm to avoid progress bar output during testing
+    with mock.patch("tqdm.tqdm") as mock_tqdm:
+        mock_tqdm.return_value = range(1)  # Single chunk
+        # Mock print to avoid output during testing
+        with mock.patch("builtins.print"):
+            result = _bootstrap_internal_no_mp(mean, data, 100)
+    # Should return 100 bootstrap replicates
+    assert len(result) == 100
+    # All results should be numbers (means)
+    assert all(isinstance(x, (int, float)) for x in result)
+    # Bootstrap means should be close to original mean
+    bootstrap_mean = mean(result)
+    original_mean = mean(data)
+    assert abs(bootstrap_mean - original_mean) < 0.5  # Should be reasonably close
 if __name__ == "__main__":
     test_acc_mutual_info_slicing()
     test_acc_mutual_info_different_predictions()
     test_acc_mutual_info_without_metric()
+    test_bootstrap_internal_no_mp()
     print("All tests passed!")
@@ -46,7 +46,6 @@ def limit() -> int:
     return 10
-# Tests
 class BaseTasks:
     """
     Base class for testing tasks
@@ -166,45 +165,3 @@ class TestNewTasksElseDefault(BaseTasks):
     Test class parameterized with a list of new/modified tasks
     (or a set of default tasks if none have been modified)
     """
-@pytest.mark.parametrize(
-    "task_class",
-    task_class(
-        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
-    ),
-    ids=lambda x: f"{x.config.task}",
-)
-class TestUnitxtTasks(BaseTasks):
-    """
-    Test class for Unitxt tasks parameterized with a small custom
-    task as described here:
-    https://www.unitxt.ai/en/latest/docs/lm_eval.html
-    """
-    def test_check_training_docs(self, task_class: ConfigurableTask):
-        if task_class.has_training_docs():
-            assert task_class.dataset["train"] is not None
-    def test_check_validation_docs(self, task_class):
-        if task_class.has_validation_docs():
-            assert task_class.dataset["validation"] is not None
-    def test_check_test_docs(self, task_class):
-        task = task_class
-        if task.has_test_docs():
-            assert task.dataset["test"] is not None
-    def test_doc_to_text(self, task_class, limit: int):
-        task = task_class
-        arr = (
-            list(islice(task.test_docs(), limit))
-            if task.has_test_docs()
-            else list(islice(task.validation_docs(), limit))
-        )
-        _array = [task.doc_to_text(doc) for doc in arr]
-        if not task.multiple_input:
-            for x in _array:
-                assert isinstance(x, str)
-        else:
-            pass
from itertools import islice

import pytest

from lm_eval import tasks as tasks
from lm_eval.api.task import ConfigurableTask
from tests.test_tasks import BaseTasks, task_class


@pytest.mark.parametrize(
    "task_class",
    task_class(
        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
    ),
    ids=lambda x: f"{x.config.task}",
)
class TestUnitxtTasks(BaseTasks):
    """
    Test class for Unitxt tasks parameterized with a small custom
    task as described here:
    https://www.unitxt.ai/en/latest/docs/lm_eval.html
    """

    def test_check_training_docs(self, task_class: ConfigurableTask):
        if task_class.has_training_docs():
            assert task_class.dataset["train"] is not None

    def test_check_validation_docs(self, task_class):
        if task_class.has_validation_docs():
            assert task_class.dataset["validation"] is not None

    def test_check_test_docs(self, task_class):
        task = task_class
        if task.has_test_docs():
            assert task.dataset["test"] is not None

    def test_doc_to_text(self, task_class, limit: int):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        if not task.multiple_input:
            for x in _array:
                assert isinstance(x, str)
        else:
            pass