gaoqiong / lm-evaluation-harness · Commits · 84d02f77

Commit 84d02f77, authored Jul 10, 2025 by Baber
Merge branch 'main' into feature/eval_from_config

Parents: 15ce554c, fcddf195
Changes: 29
Showing 9 changed files with 159 additions and 168 deletions.
mypy.ini                               +0    -29
pyproject.toml                         +3    -25
scripts/zeno_visualize.py              +25   -5
templates/example_ci_config.yaml       +13   -4
tests/models/test_neuralmagic.py       +0    -62
tests/scripts/test_zeno_visualize.py   +40   -0
tests/test_metrics.py                  +29   -0
tests/test_tasks.py                    +0    -43
tests/test_unitxt_tasks.py             +49   -0
mypy.ini  (deleted, 100644 → 0)

[mypy]
python_version = 3.8
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True

# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True

[mypy-lm_eval.api.*]
ignore_errors = True

[mypy-lm_eval.prompts.*]
ignore_errors = True

[mypy-lm_eval.models.*]
ignore_errors = True

[mypy-scripts.*]
ignore_errors = True

[mypy-main]
ignore_errors = True
pyproject.toml

...
@@ -60,8 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planner==1.4.2"]
api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
audiolm_qwen = ["librosa", "soundfile"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
gptq = ["auto-gptq[triton]>=0.6.0"]
gptqmodel = ["gptqmodel>=1.0.9"]
hf_transfer = ["hf_transfer"]
...
@@ -79,41 +78,20 @@ promptsource = ["promptsource>=0.2.3"]
ruler = ["nltk", "wonderwords", "scipy"]
sae_lens = ["sae_lens"]
sentencepiece = ["sentencepiece>=0.1.98"]
sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
sparsify = ["sparsify"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
unitxt = ["unitxt==1.22.0"]
vllm = ["vllm>=0.4.2"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
zeno = ["pandas", "zeno-client"]
all = [
tasks = [
    "lm_eval[acpbench]",
    "lm_eval[api]",
    "lm_eval[audiolm_qwen]",
    "lm_eval[deepsparse]",
    "lm_eval[dev]",
    "lm_eval[gptq]",
    "lm_eval[gptqmodel]",
    "lm_eval[hf_transfer]",
    "lm_eval[ibm_watsonx_ai]",
    "lm_eval[ifeval]",
    "lm_eval[ipex]",
    "lm_eval[japanese_leaderboard]",
    "lm_eval[longbench]",
    "lm_eval[mamba]",
    "lm_eval[math]",
    "lm_eval[multilingual]",
    "lm_eval[neuronx]",
    "lm_eval[optimum]",
    "lm_eval[promptsource]",
    "lm_eval[ruler]",
    "lm_eval[sae_lens]",
    "lm_eval[sentencepiece]",
    "lm_eval[sparseml]",
    "lm_eval[sparsify]",
    "lm_eval[testing]",
    "lm_eval[vllm]",
    "lm_eval[wandb]",
    "lm_eval[zeno]",
]

[tool.pymarkdown]
...
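The optional-dependency groups above can also be inspected at runtime. Below is a minimal sketch, assuming the package is already installed and that its distribution name resolves as "lm_eval"; adjust the name if your environment normalizes it differently:

import importlib.metadata as md

# requires() returns raw requirement strings, including the extra each one
# belongs to, e.g. 'zeno-client; extra == "zeno"'.
for req in md.requires("lm_eval") or []:
    if "extra ==" in req:
        print(req)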
scripts/zeno_visualize.py

...
@@ -4,6 +4,7 @@ import logging
import os
import re
from pathlib import Path
from typing import Union

import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
...
@@ -35,6 +36,22 @@ def parse_args():
    return parser.parse_args()


def sanitize_string(model_args_raw: Union[str, dict]) -> str:
    """Sanitize the model_args string or dict"""
    # Convert to string if it's a dictionary
    model_args_str = (
        json.dumps(model_args_raw) if isinstance(model_args_raw, dict) else model_args_raw
    )
    # Apply the sanitization
    return re.sub(
        r"[\"<>:/|\\?*\[\]]+",
        "__",
        model_args_str,
    )


def main():
    """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.
...
@@ -87,13 +104,16 @@ def main():
        latest_sample_results = get_latest_filename(
            [Path(f).name for f in model_sample_filenames if task in f]
        )
-        model_args = re.sub(
-            r"[\"<>:/\|\\?\*\[\]]+",
-            "__",
+        # Load the model_args, which can be either a string or a dictionary
+        model_args = sanitize_string(
            json.load(
-                open(Path(args.data_path, model, latest_results), encoding="utf-8")
-            )["config"]["model_args"],
+                open(
+                    Path(args.data_path, model, latest_results),
+                    encoding="utf-8",
+                )
+            )["config"]["model_args"]
        )
        print(model_args)
        data = []
        with open(
...
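For orientation, here is a small usage sketch of the new sanitize_string helper; the outputs in the comments were worked out by hand from the regex above, and importing the module assumes the zeno extra (zeno-client, pandas) is installed so that scripts.zeno_visualize imports cleanly:

from scripts.zeno_visualize import sanitize_string

# String input: only "/" falls in the forbidden character class, so it becomes "__".
print(sanitize_string("pretrained=EleutherAI/pythia-160m,dtype=float32"))
# -> pretrained=EleutherAI__pythia-160m,dtype=float32

# Dict input: the dict is json.dumps()'d first, so the quotes, colons and
# slashes of the serialized form are each collapsed into "__".
print(sanitize_string({"pretrained": "EleutherAI/pythia-160m"}))
# -> {__pretrained__ __EleutherAI__pythia-160m__}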
templates/example_ci_config.yaml

...
@@ -4,10 +4,10 @@
# instead of passing them as command-line arguments.
#
# Usage:
-# $ lm_eval --config configs/default_config.yaml
+# $ lm_eval --config templates/example_ci_config.yaml
#
# You can override any values in this config with command-line arguments:
-# $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
+# $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
#
# All parameters are optional and have the same meaning as their CLI counterparts.
...
@@ -17,9 +17,18 @@ model_args:
  dtype: float16
tasks:
  - hellaswag
  - gsm8k
  - arc_easy
batch_size: 1
device: mps
trust_remote_code: true
log_samples: true
output_path: ./test
limit: 10
gen_kwargs:
  do_sample: true
  temperature: 0.7
samples:
  hellaswag: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  arc_easy: [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
metadata:
  name: Example CI Config
  description: This is an example configuration file for testing purposes.
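The comments in this template describe the intended precedence: values in the YAML file act as defaults and anything passed on the command line wins. A minimal sketch of that merge logic, not the harness's actual loader, assuming PyYAML is available and that the hypothetical override dict mirrors the CLI flags:

import yaml


def load_config_with_overrides(path: str, overrides: dict) -> dict:
    """Read a YAML config file and let explicit overrides take precedence."""
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    # CLI-style overrides win over values from the file.
    config.update({k: v for k, v in overrides.items() if v is not None})
    return config


# Roughly mirrors:
#   lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
cfg = load_config_with_overrides(
    "templates/example_ci_config.yaml",
    {"model_args": "pretrained=gpt2", "tasks": ["mmlu"]},
)
print(cfg["tasks"], cfg["limit"])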
tests/models/test_neuralmagic.py  (deleted, 100644 → 0)

import pytest

from lm_eval import evaluator
from lm_eval.api.registry import get_model


SPARSEML_MODELS_TASKS = [
    # loglikelihood
    ("facebook/opt-125m", "lambada_openai"),
    # loglikelihood_rolling
    ("hf-internal-testing/tiny-random-gpt2", "wikitext"),
    # generate_until
    ("mgoin/tiny-random-llama-2-quant", "gsm8k"),
]

DEEPSPARSE_MODELS_TASKS = [
    # loglikelihood
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "lambada_openai"),
    # loglikelihood_rolling (not supported yet)
    # ("hf:mgoin/llama2.c-stories15M-quant-ds", "wikitext"),
    # generate_until
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "gsm8k"),
]


@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
    lm = get_model("sparseml").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
            "device": "cpu",
            "dtype": "float32",
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )


@pytest.mark.parametrize("model_id,task", DEEPSPARSE_MODELS_TASKS)
def test_deepsparse_eval(model_id, task):
    lm = get_model("deepsparse").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )
tests/scripts/test_zeno_visualize.py  (new file, 0 → 100644)

import json
import re

import pytest

from scripts.zeno_visualize import sanitize_string


@pytest.skip("requires zeno_client dependency")
def test_zeno_sanitize_string():
    """
    Test that the model_args handling logic in zeno_visualize.py properly handles
    different model_args formats (string and dictionary).
    """
    # Define the process_model_args function that replicates the fixed logic in zeno_visualize.py
    # Test case 1: model_args as a string
    string_model_args = "pretrained=EleutherAI/pythia-160m,dtype=float32"
    result_string = sanitize_string(string_model_args)
    expected_string = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", string_model_args)

    # Test case 2: model_args as a dictionary
    dict_model_args = {"pretrained": "EleutherAI/pythia-160m", "dtype": "float32"}
    result_dict = sanitize_string(dict_model_args)
    expected_dict = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", json.dumps(dict_model_args))

    # Verify the results
    assert result_string == expected_string
    assert result_dict == expected_dict

    # Also test that the sanitization works as expected
    assert ":" not in result_string  # No colons in sanitized output
    assert ":" not in result_dict  # No colons in sanitized output
    assert "/" not in result_dict  # No slashes in sanitized output
    assert "<" not in result_dict  # No angle brackets in sanitized output


if __name__ == "__main__":
    test_zeno_sanitize_string()
    print("All tests passed.")
tests/test_metrics.py

import unittest.mock as mock

from lm_eval.api.metrics import _bootstrap_internal_no_mp, mean
from lm_eval.api.task import ConfigurableTask, TaskConfig
...
@@ -149,8 +152,34 @@ def test_acc_mutual_info_without_metric():
    assert result_dict["acc"] == 1.0


def test_bootstrap_internal_no_mp():
    """Test basic functionality of _bootstrap_internal_no_mp"""
    data = [1, 2, 3, 4, 5]

    # Mock tqdm to avoid progress bar output during testing
    with mock.patch("tqdm.tqdm") as mock_tqdm:
        mock_tqdm.return_value = range(1)  # Single chunk
        # Mock print to avoid output during testing
        with mock.patch("builtins.print"):
            result = _bootstrap_internal_no_mp(mean, data, 100)

    # Should return 100 bootstrap replicates
    assert len(result) == 100
    # All results should be numbers (means)
    assert all(isinstance(x, (int, float)) for x in result)
    # Bootstrap means should be close to original mean
    bootstrap_mean = mean(result)
    original_mean = mean(data)
    assert abs(bootstrap_mean - original_mean) < 0.5  # Should be reasonably close


if __name__ == "__main__":
    test_acc_mutual_info_slicing()
    test_acc_mutual_info_different_predictions()
    test_acc_mutual_info_without_metric()
    test_bootstrap_internal_no_mp()
    print("All tests passed!")
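The new test exercises _bootstrap_internal_no_mp, which by its name is a bootstrap helper that avoids multiprocessing. For orientation, here is a plain-Python sketch of the same resampling idea, illustrative only and not the harness implementation:

import random
from statistics import mean


def bootstrap_means(data, iters, seed=1234):
    """Resample `data` with replacement `iters` times and return each sample's mean."""
    rnd = random.Random(seed)
    return [mean(rnd.choices(data, k=len(data))) for _ in range(iters)]


replicates = bootstrap_means([1, 2, 3, 4, 5], iters=100)
assert len(replicates) == 100
# The spread of these replicates approximates the standard error of the mean.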
tests/test_tasks.py

...
@@ -46,7 +46,6 @@ def limit() -> int:
    return 10


# Tests
class BaseTasks:
    """
    Base class for testing tasks
...
@@ -166,45 +165,3 @@ class TestNewTasksElseDefault(BaseTasks):
    Test class parameterized with a list of new/modified tasks
    (or a set of default tasks if none have been modified)
    """


@pytest.mark.parametrize(
    "task_class",
    task_class(
        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
    ),
    ids=lambda x: f"{x.config.task}",
)
class TestUnitxtTasks(BaseTasks):
    """
    Test class for Unitxt tasks parameterized with a small custom
    task as described here:
    https://www.unitxt.ai/en/latest/docs/lm_eval.html
    """

    def test_check_training_docs(self, task_class: ConfigurableTask):
        if task_class.has_training_docs():
            assert task_class.dataset["train"] is not None

    def test_check_validation_docs(self, task_class):
        if task_class.has_validation_docs():
            assert task_class.dataset["validation"] is not None

    def test_check_test_docs(self, task_class):
        task = task_class
        if task.has_test_docs():
            assert task.dataset["test"] is not None

    def test_doc_to_text(self, task_class, limit: int):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        if not task.multiple_input:
            for x in _array:
                assert isinstance(x, str)
        else:
            pass
tests/test_unitxt_tasks.py  (new file, 0 → 100644)

from itertools import islice

import pytest

from lm_eval import tasks as tasks
from lm_eval.api.task import ConfigurableTask
from tests.test_tasks import BaseTasks, task_class


@pytest.mark.parametrize(
    "task_class",
    task_class(
        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
    ),
    ids=lambda x: f"{x.config.task}",
)
class TestUnitxtTasks(BaseTasks):
    """
    Test class for Unitxt tasks parameterized with a small custom
    task as described here:
    https://www.unitxt.ai/en/latest/docs/lm_eval.html
    """

    def test_check_training_docs(self, task_class: ConfigurableTask):
        if task_class.has_training_docs():
            assert task_class.dataset["train"] is not None

    def test_check_validation_docs(self, task_class):
        if task_class.has_validation_docs():
            assert task_class.dataset["validation"] is not None

    def test_check_test_docs(self, task_class):
        task = task_class
        if task.has_test_docs():
            assert task.dataset["test"] is not None

    def test_doc_to_text(self, task_class, limit: int):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        if not task.multiple_input:
            for x in _array:
                assert isinstance(x, str)
        else:
            pass
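To run just the relocated Unitxt tests from Python rather than a shell, a minimal sketch, assuming the repository root is the working directory and that the unitxt extra plus ./tests/testconfigs are present:

import pytest

# Equivalent to invoking pytest on the new module; pytest.main returns an exit code.
raise SystemExit(pytest.main(["-q", "tests/test_unitxt_tasks.py"]))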