"examples/vscode:/vscode.git/clone" did not exist on "b7cd743038f58ef6a5b183a912211622a63db359"
Commit 84d02f77 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into feature/eval_from_config

parents 15ce554c fcddf195
[mypy]
python_version = 3.8
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True
# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True
[mypy-lm_eval.api.*]
ignore_errors = True
[mypy-lm_eval.prompts.*]
ignore_errors = True
[mypy-lm_eval.models.*]
ignore_errors = True
[mypy-scripts.*]
ignore_errors = True
[mypy-main]
ignore_errors = True
@@ -60,8 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planner==1.4.2"]
api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
audiolm_qwen = ["librosa", "soundfile"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
gptq = ["auto-gptq[triton]>=0.6.0"]
gptqmodel = ["gptqmodel>=1.0.9"]
hf_transfer = ["hf_transfer"]
@@ -79,41 +78,20 @@ promptsource = ["promptsource>=0.2.3"]
ruler = ["nltk", "wonderwords", "scipy"]
sae_lens = ["sae_lens"]
sentencepiece = ["sentencepiece>=0.1.98"]
sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
sparsify = ["sparsify"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
unitxt = ["unitxt==1.22.0"]
vllm = ["vllm>=0.4.2"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
zeno = ["pandas", "zeno-client"]
all = [
tasks = [
"lm_eval[acpbench]",
"lm_eval[api]",
"lm_eval[audiolm_qwen]",
"lm_eval[deepsparse]",
"lm_eval[dev]",
"lm_eval[gptq]",
"lm_eval[gptqmodel]",
"lm_eval[hf_transfer]",
"lm_eval[ibm_watsonx_ai]",
"lm_eval[ifeval]",
"lm_eval[ipex]",
"lm_eval[japanese_leaderboard]",
"lm_eval[longbench]",
"lm_eval[mamba]",
"lm_eval[math]",
"lm_eval[multilingual]",
"lm_eval[neuronx]",
"lm_eval[optimum]",
"lm_eval[promptsource]",
"lm_eval[ruler]",
"lm_eval[sae_lens]",
"lm_eval[sentencepiece]",
"lm_eval[sparseml]",
"lm_eval[sparsify]",
"lm_eval[testing]",
"lm_eval[vllm]",
"lm_eval[wandb]",
"lm_eval[zeno]",
]
[tool.pymarkdown]
......
@@ -4,6 +4,7 @@ import logging
import os
import re
from pathlib import Path
from typing import Union
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
@@ -35,6 +36,22 @@ def parse_args():
return parser.parse_args()
def sanitize_string(model_args_raw: Union[str, dict]) -> str:
"""Sanitize the model_args string or dict"""
# Convert to string if it's a dictionary
model_args_str = (
json.dumps(model_args_raw)
if isinstance(model_args_raw, dict)
else model_args_raw
)
# Apply the sanitization
return re.sub(
r"[\"<>:/|\\?*\[\]]+",
"__",
model_args_str,
)
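# Note (added for illustration, not part of the original script): sanitize_string
# collapses any run of the characters " < > : / | \ ? * [ ] into "__". For example,
#   sanitize_string("pretrained=EleutherAI/pythia-160m,dtype=float32")
# returns "pretrained=EleutherAI__pythia-160m,dtype=float32"; a dict input is first
# serialized with json.dumps and then sanitized the same way.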
def main():
"""Upload the results of your benchmark tasks to the Zeno AI evaluation platform.
@@ -87,13 +104,16 @@ def main():
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
# Load the model_args, which can be either a string or a dictionary
model_args = sanitize_string(
json.load(
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
open(
Path(args.data_path, model, latest_results),
encoding="utf-8",
)
)["config"]["model_args"]
)
print(model_args)
data = []
with open(
......
@@ -4,10 +4,10 @@
# instead of passing them as command-line arguments.
#
# Usage:
# $ lm_eval --config configs/default_config.yaml
# $ lm_eval --config templates/example_ci_config.yaml
#
# You can override any values in this config with command-line arguments:
# $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
# $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
#
# All parameters are optional and have the same meaning as their CLI counterparts.
@@ -17,9 +17,18 @@ model_args:
dtype: float16
tasks:
- hellaswag
- gsm8k
- arc_easy
batch_size: 1
device: mps
trust_remote_code: true
log_samples: true
output_path: ./test
limit: 10
gen_kwargs:
do_sample: true
temperature: 0.7
samples:
hellaswag: [1,2,3,4,5,6,7,8,9,10]
arc_easy: [10,20,30,40,50,60,70,80,90,100]
metadata:
name: Example CI Config
description: This is an example configuration file for testing purposes.
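The override behaviour described in the config's header comments (command-line flags take precedence over values from the file) can be sketched roughly as follows; merge_config_with_cli and its argument names are illustrative, not the harness's actual loader:
import yaml


def merge_config_with_cli(config_path: str, cli_overrides: dict) -> dict:
    # Hypothetical sketch: load the YAML config, then let explicitly-set CLI
    # arguments win over values from the file.
    with open(config_path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}
    cfg.update({k: v for k, v in cli_overrides.items() if v is not None})
    return cfg


# Example: keeps limit=10 from the file but replaces the tasks list.
# merge_config_with_cli("templates/example_ci_config.yaml", {"tasks": ["mmlu"], "limit": None})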
import pytest
from lm_eval import evaluator
from lm_eval.api.registry import get_model
SPARSEML_MODELS_TASKS = [
# loglikelihood
("facebook/opt-125m", "lambada_openai"),
# loglikelihood_rolling
("hf-internal-testing/tiny-random-gpt2", "wikitext"),
# generate_until
("mgoin/tiny-random-llama-2-quant", "gsm8k"),
]
DEEPSPARSE_MODELS_TASKS = [
# loglikelihood
("hf:mgoin/llama2.c-stories15M-quant-ds", "lambada_openai"),
# loglikelihood_rolling (not supported yet)
# ("hf:mgoin/llama2.c-stories15M-quant-ds", "wikitext"),
# generate_until
("hf:mgoin/llama2.c-stories15M-quant-ds", "gsm8k"),
]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string(
f"pretrained={model_id}",
{
"batch_size": 1,
"device": "cpu",
"dtype": "float32",
},
)
limit = 5
evaluator.simple_evaluate(
model=lm,
tasks=[task],
num_fewshot=0,
limit=limit,
)
@pytest.mark.parametrize("model_id,task", DEEPSPARSE_MODELS_TASKS)
def test_deepsparse_eval(model_id, task):
lm = get_model("deepsparse").create_from_arg_string(
f"pretrained={model_id}",
{
"batch_size": 1,
},
)
limit = 5
evaluator.simple_evaluate(
model=lm,
tasks=[task],
num_fewshot=0,
limit=limit,
)
import json
import re
import pytest
from scripts.zeno_visualize import sanitize_string
@pytest.mark.skip(reason="requires zeno_client dependency")
def test_zeno_sanitize_string():
"""
Test that the model_args handling logic in zeno_visualize.py properly handles
different model_args formats (string and dictionary).
"""
# sanitize_string is imported from scripts.zeno_visualize, so this exercises the fixed logic directly
# Test case 1: model_args as a string
string_model_args = "pretrained=EleutherAI/pythia-160m,dtype=float32"
result_string = sanitize_string(string_model_args)
expected_string = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", string_model_args)
# Test case 2: model_args as a dictionary
dict_model_args = {"pretrained": "EleutherAI/pythia-160m", "dtype": "float32"}
result_dict = sanitize_string(dict_model_args)
expected_dict = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", json.dumps(dict_model_args))
# Verify the results
assert result_string == expected_string
assert result_dict == expected_dict
# Also test that the sanitization works as expected
assert ":" not in result_string # No colons in sanitized output
assert ":" not in result_dict # No colons in sanitized output
assert "/" not in result_dict # No slashes in sanitized output
assert "<" not in result_dict # No angle brackets in sanitized output
if __name__ == "__main__":
test_zeno_sanitize_string()
print("All tests passed.")
import unittest.mock as mock
from lm_eval.api.metrics import _bootstrap_internal_no_mp, mean
from lm_eval.api.task import ConfigurableTask, TaskConfig
@@ -149,8 +152,34 @@ def test_acc_mutual_info_without_metric():
assert result_dict["acc"] == 1.0
def test_bootstrap_internal_no_mp():
"""Test basic functionality of _bootstrap_internal_no_mp"""
data = [1, 2, 3, 4, 5]
# Mock tqdm to avoid progress bar output during testing
with mock.patch("tqdm.tqdm") as mock_tqdm:
mock_tqdm.return_value = range(1) # Single chunk
# Mock print to avoid output during testing
with mock.patch("builtins.print"):
result = _bootstrap_internal_no_mp(mean, data, 100)
# Should return 100 bootstrap replicates
assert len(result) == 100
# All results should be numbers (means)
assert all(isinstance(x, (int, float)) for x in result)
# Bootstrap means should be close to original mean
bootstrap_mean = mean(result)
original_mean = mean(data)
assert abs(bootstrap_mean - original_mean) < 0.5 # Should be reasonably close
if __name__ == "__main__":
test_acc_mutual_info_slicing()
test_acc_mutual_info_different_predictions()
test_acc_mutual_info_without_metric()
test_bootstrap_internal_no_mp()
print("All tests passed!")
@@ -46,7 +46,6 @@ def limit() -> int:
return 10
# Tests
class BaseTasks:
"""
Base class for testing tasks
@@ -166,45 +165,3 @@ class TestNewTasksElseDefault(BaseTasks):
Test class parameterized with a list of new/modified tasks
(or a set of default tasks if none have been modified)
"""
@pytest.mark.parametrize(
"task_class",
task_class(
["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
),
ids=lambda x: f"{x.config.task}",
)
class TestUnitxtTasks(BaseTasks):
"""
Test class for Unitxt tasks parameterized with a small custom
task as described here:
https://www.unitxt.ai/en/latest/docs/lm_eval.html
"""
def test_check_training_docs(self, task_class: ConfigurableTask):
if task_class.has_training_docs():
assert task_class.dataset["train"] is not None
def test_check_validation_docs(self, task_class):
if task_class.has_validation_docs():
assert task_class.dataset["validation"] is not None
def test_check_test_docs(self, task_class):
task = task_class
if task.has_test_docs():
assert task.dataset["test"] is not None
def test_doc_to_text(self, task_class, limit: int):
task = task_class
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array = [task.doc_to_text(doc) for doc in arr]
if not task.multiple_input:
for x in _array:
assert isinstance(x, str)
else:
pass
from itertools import islice
import pytest
from lm_eval import tasks as tasks
from lm_eval.api.task import ConfigurableTask
from tests.test_tasks import BaseTasks, task_class
@pytest.mark.parametrize(
"task_class",
task_class(
["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
),
ids=lambda x: f"{x.config.task}",
)
class TestUnitxtTasks(BaseTasks):
"""
Test class for Unitxt tasks parameterized with a small custom
task as described here:
https://www.unitxt.ai/en/latest/docs/lm_eval.html
"""
def test_check_training_docs(self, task_class: ConfigurableTask):
if task_class.has_training_docs():
assert task_class.dataset["train"] is not None
def test_check_validation_docs(self, task_class):
if task_class.has_validation_docs():
assert task_class.dataset["validation"] is not None
def test_check_test_docs(self, task_class):
task = task_class
if task.has_test_docs():
assert task.dataset["test"] is not None
def test_doc_to_text(self, task_class, limit: int):
task = task_class
arr = (
list(islice(task.test_docs(), limit))
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
_array = [task.doc_to_text(doc) for doc in arr]
if not task.multiple_input:
for x in _array:
assert isinstance(x, str)
else:
pass