gaoqiong / lm-evaluation-harness · Commits

Commit 91e49e23
Authored Jul 10, 2025 by Baber

fix: update default values and improve help text in configuration files

Parent: 84d02f77
Showing 4 changed files with 27 additions and 25 deletions
lm_eval/_cli/run.py                  +10  -7
lm_eval/config/evaluate_config.py    +11  -15
lm_eval/evaluator.py                  +3  -1
templates/example_ci_config.yaml      +3  -2
lm_eval/_cli/run.py  (view file @ 91e49e23)

@@ -24,7 +24,7 @@ class Run(SubCommand):
             "run",
             help="Run the evaluation harness on specified tasks",
             description="Evaluate language models on various benchmarks and tasks.",
-            usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
+            usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
             epilog=textwrap.dedent(
                 """
                 examples:
                   # Basic evaluation with HuggingFace model

@@ -34,7 +34,7 @@ class Run(SubCommand):
                   $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5
                   # Evaluation with custom generation parameters
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'
                   # Use configuration file
                   $ lm-eval run --config my_config.yaml --tasks mmlu

@@ -133,7 +133,8 @@ class Run(SubCommand):
             nargs="*",
             metavar="KWARGS",
             help=textwrap.dedent(
-                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`. Values should be parsable with ast.literal_eval.'
+                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`.'
+                "Values should be parsable with ast.literal_eval."
             ),
         )

@@ -167,9 +168,10 @@ class Run(SubCommand):
             "-E",
             default=None,
             type=try_parse_json,
-            metavar="JSON_FILE",
+            metavar='"task1": [1,2,3,4,...]"',
             help=textwrap.dedent(
-                'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
+                "`...` `...` Sample indices for inputs. Incompatible with --limit."
+                " Values be parsable with ast.literal_eval."
             ),
         )

@@ -314,9 +316,10 @@ class Run(SubCommand):
             "--metadata",
             type=json.loads,
             default=None,
-            metavar="JSON",
+            metavar="`key=val` `key2=val2`",
             help=textwrap.dedent(
-                """JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
+                """`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
+                required for some tasks such as RULER"""
             ),
         )
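Several of the updated help strings (--gen_kwargs, --samples, --metadata) point to ast.literal_eval for turning `key=val` pairs into Python values. A minimal sketch of that kind of parsing; the helper parse_kv_args is hypothetical and not part of the harness:

    import ast

    def parse_kv_args(pairs):
        # Hypothetical helper: parse ["temperature=0.8", 'stop=["\n\n"]'] into a dict.
        parsed = {}
        for pair in pairs:
            key, _, raw = pair.partition("=")
            try:
                parsed[key] = ast.literal_eval(raw)  # numbers, strings, lists, ...
            except (ValueError, SyntaxError):
                parsed[key] = raw  # fall back to the raw string
        return parsed

    print(parse_kv_args(["temperature=0.8", "top_p=0.95", 'stop=["\\n\\n"]']))
    # {'temperature': 0.8, 'top_p': 0.95, 'stop': ['\n\n']}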
lm_eval/config/evaluate_config.py  (view file @ 91e49e23)

@@ -21,6 +21,7 @@ DICT_KEYS = [
     "hf_hub_log_args",
     "metadata",
     "model_args",
+    "gen_kwargs",
 ]
@@ -79,7 +80,7 @@ class EvaluatorConfig:
     # Device
     device: Optional[str] = field(
-        default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
+        default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
     )
     # Data sampling and limiting

@@ -126,7 +127,10 @@ class EvaluatorConfig:
         default=None, metadata={"help": "Custom System instruction to add"}
     )
     apply_chat_template: Union[bool, str] = field(
-        default=False, metadata={"help": "Apply chat template to prompt"}
+        default=False,
+        metadata={
+            "help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
+        },
     )
     fewshot_as_multiturn: bool = field(
         default=False,
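The apply_chat_template field keeps the field(default=..., metadata={"help": ...}) pattern used throughout EvaluatorConfig. A standalone sketch of that pattern, with a hypothetical DemoConfig class rather than the harness's real one, showing how the help metadata can be read back:

    from dataclasses import dataclass, field, fields
    from typing import Optional, Union

    # Standalone sketch; DemoConfig is hypothetical, not the harness's EvaluatorConfig.
    @dataclass
    class DemoConfig:
        device: Optional[str] = field(
            default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
        )
        apply_chat_template: Union[bool, str] = field(
            default=False,
            metadata={
                "help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
            },
        )

    # The help metadata can be read back, e.g. to build CLI help text.
    for f in fields(DemoConfig):
        print(f"{f.name}: {f.metadata['help']} (default={f.default!r})")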
@@ -170,7 +174,7 @@ class EvaluatorConfig:
         metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
     )
-    # Security and safety
+    # Security
     trust_remote_code: bool = field(
         default=False, metadata={"help": "Trust remote code for HF datasets"}
     )

@@ -201,7 +205,7 @@ class EvaluatorConfig:
             config.update(cls.load_yaml_config(namespace.config))
         # Override with CLI args (only truthy values, exclude non-config args)
-        excluded_args = {"config", "command", "func"}  # argparse internal args
+        excluded_args = {"command", "func"}  # argparse internal args
         cli_args = {
             k: v for k, v in vars(namespace).items() if v and k not in excluded_args
         }
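The override logic loads the YAML config first, then merges in only truthy CLI values while skipping argparse-internal keys. A minimal sketch of that merge with a made-up Namespace; the attribute names here are illustrative, not the harness's full argument set:

    from argparse import Namespace

    # Values loaded from a YAML config file (hypothetical contents).
    config = {"model": "hf", "batch_size": 1, "device": "cpu"}

    # Parsed CLI arguments; None/False/empty values mean "not set on the CLI".
    namespace = Namespace(command="run", func=None, model=None, device="cuda:0", tasks=["mmlu"])

    excluded_args = {"command", "func"}  # argparse internal args
    cli_args = {k: v for k, v in vars(namespace).items() if v and k not in excluded_args}

    config.update(cli_args)  # truthy CLI values win over YAML values
    print(config)  # {'model': 'hf', 'batch_size': 1, 'device': 'cuda:0', 'tasks': ['mmlu']}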
@@ -252,7 +256,6 @@ class EvaluatorConfig:
         try:
             yaml_data = yaml.safe_load(config_file.read_text())
-            print(textwrap.dedent(f"""yaml: {yaml_data} """))
         except yaml.YAMLError as e:
             raise ValueError(f"Invalid YAML in {config_path}: {e}")
         except (OSError, UnicodeDecodeError) as e:
@@ -337,17 +340,10 @@ class EvaluatorConfig:
             metadata=self.metadata if self.metadata else {},
         )
-        # self.tasks is a comma-separated string of task names
-        if isinstance((task_list := self.tasks), str):
-            task_list = self.tasks.split(",")
-        else:
-            assert isinstance(self.tasks, list), (
-                "`tasks` must be a comma delimited string of task names or list[str]."
-            )
-        task_names = task_manager.match_tasks(task_list)
+        task_names = task_manager.match_tasks(self.tasks)
         # Check for any individual task files in the list
-        for task in [task for task in task_list if task not in task_names]:
+        for task in [task for task in self.tasks if task not in task_names]:
             task_path = Path(task)
             if task_path.is_file():
                 config = utils.load_yaml_config(str(task_path))

@@ -355,7 +351,7 @@ class EvaluatorConfig:
         # Check for missing tasks
         task_missing = [
-            task for task in task_list if task not in task_names and "*" not in task
+            task for task in self.tasks if task not in task_names and "*" not in task
         ]
         if task_missing:
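With this change self.tasks is treated as a list of task names throughout, instead of being split from a comma-separated string. A small sketch of the match-then-report-missing pattern the diff uses; match_tasks here is a hypothetical stand-in for task_manager.match_tasks, implemented with fnmatch:

    import fnmatch

    # Hypothetical stand-in for task_manager.match_tasks: resolve requested names
    # (including "*" wildcards) against the set of registered task names.
    def match_tasks(requested, registered):
        matched = []
        for name in requested:
            matched.extend(t for t in registered if fnmatch.fnmatch(t, name))
        return sorted(set(matched))

    registered = ["hellaswag", "arc_easy", "arc_challenge", "mmlu"]
    tasks = ["hellaswag", "arc_*", "not_a_task"]

    task_names = match_tasks(tasks, registered)
    # Anything that neither matched nor contains a wildcard is reported as missing.
    task_missing = [t for t in tasks if t not in task_names and "*" not in t]
    print(task_names)    # ['arc_challenge', 'arc_easy', 'hellaswag']
    print(task_missing)  # ['not_a_task']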
lm_eval/evaluator.py  (view file @ 91e49e23)

@@ -467,7 +467,9 @@ def evaluate(
             "Either 'limit' or 'samples' must be None, but both are not None."
         )
     if samples is not None:
-        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
+        eval_logger.info(
+            f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
+        )
     if apply_chat_template:
         eval_logger.warning(
             "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
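The updated log line reports only the sample keys that correspond to tasks actually present in task_dict. A small illustration of that filter with made-up dictionaries:

    # Hypothetical inputs: per-task sample indices and the tasks actually being run.
    samples = {"hellaswag": [1, 2, 3], "arc_easy": [10, 20], "unused_task": [0]}
    task_dict = {"hellaswag": object(), "arc_easy": object()}

    # Only report tasks that are present in task_dict, mirroring the new log line.
    reported = [x for x in list(samples.keys()) if x in task_dict.keys()]
    print(f"Evaluating examples for tasks {reported}")
    # -> Evaluating examples for tasks ['hellaswag', 'arc_easy']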
templates/example_ci_config.yaml  (view file @ 91e49e23)

@@ -6,9 +6,10 @@
 # Usage:
 #   $ lm_eval --config templates/example_ci_config.yaml
 #
-# You can override any values in this config with command-line arguments:
+# You can override any values in this config with further command-line arguments:
 #   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
 # For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
+# All parameters are optional and have the same meaning as their CLI counterparts.
 model: hf

@@ -19,13 +20,13 @@ tasks:
   - hellaswag
   - arc_easy
 batch_size: 1
 device: mps
 trust_remote_code: true
 log_samples: true
 output_path: ./test
 gen_kwargs:
   do_sample: true
   temperature: 0.7
   stop: ["\n", "<|endoftext|>"]
 samples:
   hellaswag: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
   arc_easy: [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
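The samples block pairs each task with explicit example indices, matching the --samples/-E option changed above. A small illustration of how such a mapping could be used to select rows; the datasets dictionary here is made up for the example:

    # Hypothetical per-task sample indices, as in the YAML above.
    samples = {
        "hellaswag": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "arc_easy": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    }

    # A made-up "dataset" per task: here just a list of document ids.
    datasets = {name: [f"{name}-doc-{i}" for i in range(200)] for name in samples}

    # Select only the listed indices for each task instead of evaluating everything.
    selected = {name: [datasets[name][i] for i in idxs] for name, idxs in samples.items()}
    print(selected["arc_easy"][:3])  # ['arc_easy-doc-10', 'arc_easy-doc-20', 'arc_easy-doc-30']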