gaoqiong / lm-evaluation-harness / Commits

Commit 84d02f77, authored Jul 10, 2025 by Baber

    Merge branch 'main' into feature/eval_from_config

Parents: 15ce554c, fcddf195
Changes: 29
Showing 9 changed files with 159 additions and 168 deletions:

    mypy.ini                               +0   -29
    pyproject.toml                         +3   -25
    scripts/zeno_visualize.py              +25  -5
    templates/example_ci_config.yaml       +13  -4
    tests/models/test_neuralmagic.py       +0   -62
    tests/scripts/test_zeno_visualize.py   +40  -0
    tests/test_metrics.py                  +29  -0
    tests/test_tasks.py                    +0   -43
    tests/test_unitxt_tasks.py             +49  -0
mypy.ini  (deleted, 100644 → 0)

[mypy]
python_version = 3.8
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True

# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True

[mypy-lm_eval.api.*]
ignore_errors = True

[mypy-lm_eval.prompts.*]
ignore_errors = True

[mypy-lm_eval.models.*]
ignore_errors = True

[mypy-scripts.*]
ignore_errors = True

[mypy-main]
ignore_errors = True
pyproject.toml

@@ -60,8 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planner==1.4.2"]
 api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 audiolm_qwen = ["librosa", "soundfile"]
-deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 gptqmodel = ["gptqmodel>=1.0.9"]
 hf_transfer = ["hf_transfer"]

@@ -79,41 +78,20 @@ promptsource = ["promptsource>=0.2.3"]
 ruler = ["nltk", "wonderwords", "scipy"]
 sae_lens = ["sae_lens"]
 sentencepiece = ["sentencepiece>=0.1.98"]
-sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 sparsify = ["sparsify"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 unitxt = ["unitxt==1.22.0"]
 vllm = ["vllm>=0.4.2"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 zeno = ["pandas", "zeno-client"]
-all = [
+tasks = [
     "lm_eval[acpbench]",
-    "lm_eval[api]",
-    "lm_eval[audiolm_qwen]",
-    "lm_eval[deepsparse]",
-    "lm_eval[dev]",
-    "lm_eval[gptq]",
-    "lm_eval[gptqmodel]",
-    "lm_eval[hf_transfer]",
-    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
-    "lm_eval[ipex]",
     "lm_eval[japanese_leaderboard]",
     "lm_eval[longbench]",
-    "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
-    "lm_eval[neuronx]",
-    "lm_eval[optimum]",
-    "lm_eval[promptsource]",
     "lm_eval[ruler]",
-    "lm_eval[sae_lens]",
-    "lm_eval[sentencepiece]",
-    "lm_eval[sparseml]",
-    "lm_eval[sparsify]",
-    "lm_eval[testing]",
-    "lm_eval[vllm]",
-    "lm_eval[wandb]",
-    "lm_eval[zeno]",
 ]

 [tool.pymarkdown]
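Note on the extras change above: with the catch-all `all` extra removed, a user of this revision would presumably install task-specific dependencies through the new `tasks` extra (for example `pip install "lm_eval[tasks]"`) or through the individual extras that remain, rather than through `lm_eval[all]`.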
scripts/zeno_visualize.py

@@ -4,6 +4,7 @@ import logging
 import os
 import re
 from pathlib import Path
+from typing import Union

 import pandas as pd
 from zeno_client import ZenoClient, ZenoMetric

@@ -35,6 +36,22 @@ def parse_args():
     return parser.parse_args()


+def sanitize_string(model_args_raw: Union[str, dict]) -> str:
+    """Sanitize the model_args string or dict"""
+    # Convert to string if it's a dictionary
+    model_args_str = (
+        json.dumps(model_args_raw) if isinstance(model_args_raw, dict) else model_args_raw
+    )
+    # Apply the sanitization
+    return re.sub(
+        r"[\"<>:/|\\?*\[\]]+",
+        "__",
+        model_args_str,
+    )
+
+
 def main():
     """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.

@@ -87,13 +104,16 @@ def main():
         latest_sample_results = get_latest_filename(
             [Path(f).name for f in model_sample_filenames if task in f]
         )
-        model_args = re.sub(
-            r"[\"<>:/\|\\?\*\[\]]+",
-            "__",
-            json.load(
-                open(Path(args.data_path, model, latest_results), encoding="utf-8")
-            )["config"]["model_args"],
+        # Load the model_args, which can be either a string or a dictionary
+        model_args = sanitize_string(
+            json.load(
+                open(
+                    Path(args.data_path, model, latest_results),
+                    encoding="utf-8",
+                )
+            )["config"]["model_args"]
         )
         print(model_args)

         data = []
         with open(
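For illustration, here is the new helper in isolation with example inputs (the inputs are made up; the logic mirrors the sanitize_string added in the diff above):

import json
import re


def sanitize_string(model_args_raw):
    # Convert a dict to its JSON string form, then collapse the characters
    # " < > : / | \ ? * [ ] into "__", as in the diff above.
    model_args_str = (
        json.dumps(model_args_raw) if isinstance(model_args_raw, dict) else model_args_raw
    )
    return re.sub(r"[\"<>:/|\\?*\[\]]+", "__", model_args_str)


print(sanitize_string("pretrained=EleutherAI/pythia-160m,dtype=float32"))
# pretrained=EleutherAI__pythia-160m,dtype=float32
print(sanitize_string({"pretrained": "EleutherAI/pythia-160m"}))
# {__pretrained__ __EleutherAI__pythia-160m__}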
templates/example_ci_config.yaml

@@ -4,10 +4,10 @@
 # instead of passing them as command-line arguments.
 #
 # Usage:
-#   $ lm_eval --config configs/default_config.yaml
+#   $ lm_eval --config templates/example_ci_config.yaml
 #
 # You can override any values in this config with command-line arguments:
-#   $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
+#   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
 # All parameters are optional and have the same meaning as their CLI counterparts.

@@ -17,9 +17,18 @@ model_args:
   dtype: float16

 tasks:
   - hellaswag
-  - gsm8k
+  - arc_easy

 batch_size: 1
-device: mps
 trust_remote_code: true
 log_samples: true
 output_path: ./test
+limit: 10
+
+gen_kwargs:
+  do_sample: true
+  temperature: 0.7
+
+samples:
+  hellaswag: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+  arc_easy: [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+
+metadata:
+  name: Example CI Config
+  description: This is an example configuration file for testing purposes.
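As a rough sketch of how a config file like this might be consumed (this is not the harness's actual loader; load_config and its merge rule are hypothetical), values from the YAML file can be read first and then overridden by CLI-style arguments, matching the usage shown in the file's comments:

import yaml  # assumes PyYAML is available


def load_config(path, **cli_overrides):
    """Read a YAML config and let CLI-style values take precedence."""
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    # CLI-style overrides win over values from the file
    config.update({k: v for k, v in cli_overrides.items() if v is not None})
    return config


cfg = load_config(
    "templates/example_ci_config.yaml",
    model_args="pretrained=gpt2",  # mirrors the --model_args override in the comments
    tasks=["mmlu"],                # mirrors the --tasks override
)
print(cfg["tasks"], cfg.get("limit"))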
tests/models/test_neuralmagic.py  (deleted, 100644 → 0)

import pytest

from lm_eval import evaluator
from lm_eval.api.registry import get_model


SPARSEML_MODELS_TASKS = [
    # loglikelihood
    ("facebook/opt-125m", "lambada_openai"),
    # loglikelihood_rolling
    ("hf-internal-testing/tiny-random-gpt2", "wikitext"),
    # generate_until
    ("mgoin/tiny-random-llama-2-quant", "gsm8k"),
]

DEEPSPARSE_MODELS_TASKS = [
    # loglikelihood
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "lambada_openai"),
    # loglikelihood_rolling (not supported yet)
    # ("hf:mgoin/llama2.c-stories15M-quant-ds", "wikitext"),
    # generate_until
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "gsm8k"),
]


@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
    lm = get_model("sparseml").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
            "device": "cpu",
            "dtype": "float32",
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )


@pytest.mark.parametrize("model_id,task", DEEPSPARSE_MODELS_TASKS)
def test_deepsparse_eval(model_id, task):
    lm = get_model("deepsparse").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )
tests/scripts/test_zeno_visualize.py  (new file, 0 → 100644)

import json
import re

import pytest

from scripts.zeno_visualize import sanitize_string


@pytest.skip("requires zeno_client dependency")
def test_zeno_sanitize_string():
    """
    Test that the model_args handling logic in zeno_visualize.py properly handles
    different model_args formats (string and dictionary).
    """
    # Test case 1: model_args as a string
    string_model_args = "pretrained=EleutherAI/pythia-160m,dtype=float32"
    result_string = sanitize_string(string_model_args)
    expected_string = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", string_model_args)

    # Test case 2: model_args as a dictionary
    dict_model_args = {"pretrained": "EleutherAI/pythia-160m", "dtype": "float32"}
    result_dict = sanitize_string(dict_model_args)
    expected_dict = re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", json.dumps(dict_model_args))

    # Verify the results
    assert result_string == expected_string
    assert result_dict == expected_dict

    # Also test that the sanitization works as expected
    assert ":" not in result_string  # No colons in sanitized output
    assert ":" not in result_dict  # No colons in sanitized output
    assert "/" not in result_dict  # No slashes in sanitized output
    assert "<" not in result_dict  # No angle brackets in sanitized output


if __name__ == "__main__":
    test_zeno_sanitize_string()
    print("All tests passed.")
tests/test_metrics.py

+import unittest.mock as mock
+
+from lm_eval.api.metrics import _bootstrap_internal_no_mp, mean
 from lm_eval.api.task import ConfigurableTask, TaskConfig

@@ -149,8 +152,34 @@ def test_acc_mutual_info_without_metric():
     assert result_dict["acc"] == 1.0


+def test_bootstrap_internal_no_mp():
+    """Test basic functionality of _bootstrap_internal_no_mp"""
+    data = [1, 2, 3, 4, 5]
+
+    # Mock tqdm to avoid progress bar output during testing
+    with mock.patch("tqdm.tqdm") as mock_tqdm:
+        mock_tqdm.return_value = range(1)  # Single chunk
+        # Mock print to avoid output during testing
+        with mock.patch("builtins.print"):
+            result = _bootstrap_internal_no_mp(mean, data, 100)
+
+    # Should return 100 bootstrap replicates
+    assert len(result) == 100
+    # All results should be numbers (means)
+    assert all(isinstance(x, (int, float)) for x in result)
+    # Bootstrap means should be close to original mean
+    bootstrap_mean = mean(result)
+    original_mean = mean(data)
+    assert abs(bootstrap_mean - original_mean) < 0.5  # Should be reasonably close
+
+
 if __name__ == "__main__":
     test_acc_mutual_info_slicing()
     test_acc_mutual_info_different_predictions()
     test_acc_mutual_info_without_metric()
+    test_bootstrap_internal_no_mp()
     print("All tests passed!")
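For context, the quantity this new test checks can be reproduced with a plain bootstrap: resample the data with replacement, recompute the mean of each resample, and confirm the replicate means hover around the original mean. The sketch below is a conceptual stand-in, not the actual _bootstrap_internal_no_mp implementation:

import random


def bootstrap_means(data, iters, seed=1234):
    # Draw `iters` resamples (with replacement) of the same size as `data`
    # and record the mean of each resample.
    rng = random.Random(seed)
    return [
        sum(rng.choice(data) for _ in data) / len(data)
        for _ in range(iters)
    ]


replicates = bootstrap_means([1, 2, 3, 4, 5], 100)
assert len(replicates) == 100
assert abs(sum(replicates) / len(replicates) - 3.0) < 0.5  # close to the original mean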
tests/test_tasks.py

@@ -46,7 +46,6 @@ def limit() -> int:
     return 10


-# Tests
 class BaseTasks:
     """
     Base class for testing tasks

@@ -166,45 +165,3 @@ class TestNewTasksElseDefault(BaseTasks):
     Test class parameterized with a list of new/modified tasks
     (or a set of default tasks if none have been modified)
     """
-
-
-@pytest.mark.parametrize(
-    "task_class",
-    task_class(
-        ["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
-    ),
-    ids=lambda x: f"{x.config.task}",
-)
-class TestUnitxtTasks(BaseTasks):
-    """
-    Test class for Unitxt tasks parameterized with a small custom
-    task as described here:
-    https://www.unitxt.ai/en/latest/docs/lm_eval.html
-    """
-
-    def test_check_training_docs(self, task_class: ConfigurableTask):
-        if task_class.has_training_docs():
-            assert task_class.dataset["train"] is not None
-
-    def test_check_validation_docs(self, task_class):
-        if task_class.has_validation_docs():
-            assert task_class.dataset["validation"] is not None
-
-    def test_check_test_docs(self, task_class):
-        task = task_class
-        if task.has_test_docs():
-            assert task.dataset["test"] is not None
-
-    def test_doc_to_text(self, task_class, limit: int):
-        task = task_class
-        arr = (
-            list(islice(task.test_docs(), limit))
-            if task.has_test_docs()
-            else list(islice(task.validation_docs(), limit))
-        )
-        _array = [task.doc_to_text(doc) for doc in arr]
-        if not task.multiple_input:
-            for x in _array:
-                assert isinstance(x, str)
-        else:
-            pass
0 → 100644
View file @
84d02f77
from
itertools
import
islice
import
pytest
from
lm_eval
import
tasks
as
tasks
from
lm_eval.api.task
import
ConfigurableTask
from
tests.test_tasks
import
BaseTasks
,
task_class
@
pytest
.
mark
.
parametrize
(
"task_class"
,
task_class
(
[
"arc_easy_unitxt"
],
tasks
.
TaskManager
(
include_path
=
"./tests/testconfigs"
)
),
ids
=
lambda
x
:
f
"
{
x
.
config
.
task
}
"
,
)
class
TestUnitxtTasks
(
BaseTasks
):
"""
Test class for Unitxt tasks parameterized with a small custom
task as described here:
https://www.unitxt.ai/en/latest/docs/lm_eval.html
"""
def
test_check_training_docs
(
self
,
task_class
:
ConfigurableTask
):
if
task_class
.
has_training_docs
():
assert
task_class
.
dataset
[
"train"
]
is
not
None
def
test_check_validation_docs
(
self
,
task_class
):
if
task_class
.
has_validation_docs
():
assert
task_class
.
dataset
[
"validation"
]
is
not
None
def
test_check_test_docs
(
self
,
task_class
):
task
=
task_class
if
task
.
has_test_docs
():
assert
task
.
dataset
[
"test"
]
is
not
None
def
test_doc_to_text
(
self
,
task_class
,
limit
:
int
):
task
=
task_class
arr
=
(
list
(
islice
(
task
.
test_docs
(),
limit
))
if
task
.
has_test_docs
()
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
_array
=
[
task
.
doc_to_text
(
doc
)
for
doc
in
arr
]
if
not
task
.
multiple_input
:
for
x
in
_array
:
assert
isinstance
(
x
,
str
)
else
:
pass
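Running this relocated test module presumably requires the unitxt dependency (pinned as unitxt==1.22.0 in the pyproject.toml changes above) and the small custom task config under ./tests/testconfigs that the parametrization points at; the test bodies themselves are unchanged from the class removed from tests/test_tasks.py.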