gaoqiong / lm-evaluation-harness · Commits

Commit b89af51e, authored Jul 10, 2025 by Baber
Parent: fadd26e4

    update default values; fixes

Showing 10 changed files with 115 additions and 57 deletions.
.pre-commit-config.yaml             +2   -3
lm_eval/__main__.py                 +5   -0
lm_eval/_cli/run.py                 +43  -21
lm_eval/api/task.py                 +29  -5
lm_eval/config/evaluate_config.py   +11  -15
lm_eval/config/metric.py            +1   -1
lm_eval/config/task.py              +2   -2
lm_eval/evaluator.py                +3   -1
pyproject.toml                      +4   -4
templates/example_ci_config.yaml    +15  -5
.pre-commit-config.yaml

@@ -29,12 +29,11 @@ repos:
       - id: mixed-line-ending
         args: [--fix=lf]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.2
+    rev: v0.12.5
     hooks:
       # Run the linter.
       - id: ruff-check
         args: [--fix]
       # Run the formatter.
       - id: ruff-format
   - repo: https://github.com/codespell-project/codespell
     rev: v2.4.1
lm_eval/__main__.py

+from rich.traceback import install
+
 from lm_eval._cli.harness import HarnessCLI
 from lm_eval.utils import setup_logging

+install(show_locals=True)
+

 def cli_evaluate() -> None:
     """Main CLI entry point."""
     setup_logging()
lm_eval/_cli/run.py

@@ -8,6 +8,8 @@ from functools import partial
 from lm_eval._cli.subcommand import SubCommand
 from lm_eval._cli.utils import (
     _int_or_none_list_arg_type,
+    key_val_to_dict,
+    merge_dicts,
     request_caching_arg_to_dict,
     try_parse_json,
 )

@@ -22,17 +24,17 @@ class Run(SubCommand):
             "run",
             help="Run the evaluation harness on specified tasks",
             description="Evaluate language models on various benchmarks and tasks.",
-            usage="lm-eval run --model <model> --tasks <task1,task2,...> [options]",
+            usage="lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]",
             epilog=textwrap.dedent("""
                 examples:
                   # Basic evaluation with HuggingFace model
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag

                   # Evaluate on multiple tasks with few-shot examples
-                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy,arc_challenge --num_fewshot 5
+                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5

                   # Evaluation with custom generation parameters
-                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs "temperature=0.8,top_p=0.95"
+                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\\n\\n"]'

                   # Use configuration file
                   $ lm-eval run --config my_config.yaml --tasks mmlu
@@ -73,9 +75,10 @@ class Run(SubCommand):
             "-t",
             default=None,
             type=str,
-            metavar="TASK1,TASK2",
+            nargs="*",
+            metavar="TASK1 TASK2",
             help=textwrap.dedent("""
-                Comma-separated list of task names or groupings.
+                Space or Comma-separated list of task names or groupings.
                 Use 'lm-eval list tasks' to see all available tasks.
             """).strip(),
         )

@@ -83,9 +86,10 @@ class Run(SubCommand):
             "--model_args",
             "-a",
             default=None,
-            type=try_parse_json,
+            nargs="*",
+            type=key_val_to_dict,
             metavar="ARGS",
-            help="Model arguments as 'key=val,key2=val2' or JSON string",
+            help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`",
         )

         # Evaluation Settings
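Note: --model_args (and, below, --gen_kwargs and the logging arguments) move from a single comma-separated/JSON string to space-separated key=val tokens converted by key_val_to_dict. That helper lives in lm_eval._cli.utils and its body is not part of this diff; the sketch below is only an assumption about its behavior, illustrating how argparse's nargs="*" applies the converter to each token separately.

    import argparse
    import ast


    def key_val_to_dict(arg: str) -> dict:
        # Hypothetical stand-in for lm_eval._cli.utils.key_val_to_dict (not shown
        # in this commit): parse "key=val" or "key=val,key2=val2" into a dict,
        # trying ast.literal_eval for each value and falling back to the raw string.
        out = {}
        for pair in arg.split(","):
            key, _, raw = pair.partition("=")
            try:
                out[key.strip()] = ast.literal_eval(raw)
            except (ValueError, SyntaxError):
                out[key.strip()] = raw
        return out


    parser = argparse.ArgumentParser()
    parser.add_argument("--model_args", "-a", nargs="*", type=key_val_to_dict, default=None)
    args = parser.parse_args(["-a", "pretrained=gpt2", "dtype=float32"])
    # nargs="*" applies the type converter to every space-separated token, so:
    print(args.model_args)  # [{'pretrained': 'gpt2'}, {'dtype': 'float32'}]

The per-token dicts are merged back into a single mapping later in _execute (see the sketch after the run.py hunks below).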
@@ -124,10 +128,14 @@ class Run(SubCommand):
         )
         eval_group.add_argument(
             "--gen_kwargs",
-            type=try_parse_json,
+            type=key_val_to_dict,
             default=None,
+            nargs="*",
             metavar="KWARGS",
-            help="Generation arguments as 'key=val,key2=val2' or JSON string",
+            help=textwrap.dedent(
+                'Generation arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`.'
+                "Values should be parsable with ast.literal_eval."
+            ),
         )

         # Data and Output

@@ -160,9 +168,10 @@ class Run(SubCommand):
             "-E",
             default=None,
             type=try_parse_json,
-            metavar="JSON_FILE",
+            metavar='"task1": [1,2,3,4,...]"',
             help=textwrap.dedent(
-                'JSON file with specific sample indices for inputs: {"task_name":[indices],...}. Incompatible with --limit.'
+                "`...` `...` Sample indices for inputs. Incompatible with --limit."
+                " Values be parsable with ast.literal_eval."
             ),
         )

@@ -250,24 +259,24 @@ class Run(SubCommand):
         )
         logging_group.add_argument(
             "--wandb_args",
-            type=str,
+            type=key_val_to_dict,
             default=argparse.SUPPRESS,
             metavar="ARGS",
-            help="Weights & Biases init arguments (key=val,key2=val2)",
+            help="Weights & Biases init arguments key=val key2=val2",
         )
         logging_group.add_argument(
             "--wandb_config_args",
-            type=str,
+            type=key_val_to_dict,
             default=argparse.SUPPRESS,
             metavar="ARGS",
-            help="Weights & Biases config arguments (key=val,key2=val2)",
+            help="Weights & Biases config arguments key=val key2=val2",
        )
         logging_group.add_argument(
             "--hf_hub_log_args",
-            type=str,
+            type=key_val_to_dict,
             default=argparse.SUPPRESS,
             metavar="ARGS",
-            help="Hugging Face Hub logging arguments (key=val,key2=val2)",
+            help="Hugging Face Hub logging arguments key=val key2=val2",
         )

         # Advanced Options

@@ -307,15 +316,28 @@ class Run(SubCommand):
             "--metadata",
             type=json.loads,
             default=None,
-            metavar="JSON",
+            metavar="`key=val` `key2=val2`",
             help=textwrap.dedent(
-                """JSON metadata for task configs (merged with model_args), required for some tasks such as RULER"""
+                """`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
+                required for some tasks such as RULER"""
             ),
         )

-    def _execute(self, args: argparse.Namespace) -> None:
+    @staticmethod
+    def _execute(args: argparse.Namespace) -> None:
         """Runs the evaluation harness with the provided arguments."""
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        MERGE_ARGS_DICTS = [
+            "model_args",
+            "gen_kwargs",
+            "wandb_args",
+            "wandb_config_args",
+            "hf_hub_log_args",
+        ]
+        for arg_name in MERGE_ARGS_DICTS:
+            if current_value := getattr(args, arg_name, None):
+                setattr(args, arg_name, merge_dicts(*current_value))
         from lm_eval.config.evaluate_config import EvaluatorConfig

         eval_logger = logging.getLogger(__name__)
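The loop added to _execute collapses each nargs="*" option (a list of per-token dicts) back into one dict via merge_dicts, which is imported above but not defined in this diff. A minimal sketch of the assumed behavior, where later dicts win on duplicate keys:

    def merge_dicts(*dicts: dict) -> dict:
        # Hypothetical stand-in for lm_eval._cli.utils.merge_dicts (not shown in
        # this commit): shallow-merge the dicts left to right.
        merged: dict = {}
        for d in dicts:
            merged.update(d)
        return merged


    # Mirrors the loop in Run._execute: each nargs="*" option arrives as a list
    # of small dicts and is collapsed back into a single mapping.
    model_args = [{"pretrained": "gpt2"}, {"dtype": "float32"}]
    print(merge_dicts(*model_args))  # {'pretrained': 'gpt2', 'dtype': 'float32'}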
lm_eval/api/task.py

@@ -8,7 +8,6 @@ import re
 from collections.abc import Callable
 from copy import deepcopy
 from functools import cached_property
-from types import MethodType
 from typing import TYPE_CHECKING, Any, Literal, overload

 import datasets

@@ -523,8 +522,8 @@ class Task(abc.ABC):
             # self.aggregation = lambda: {
             #     metric_name: get_metric_aggregation(metric_name)
             # }
-            setattr(self._config, "metric_list", [MetricConfig(name=metric_name)])
-            setattr(self._config, "process_results", lambda *args: {"bypass": 0})
+            self._config.metric_list = [MetricConfig(name=metric_name)]
+            self._config.process_results = lambda *args: {"bypass": 0}

     def set_fewshot_seed(self, seed: int | None = None) -> None:
         self.fewshot_rnd = random.Random(seed)

@@ -656,6 +655,18 @@ class ConfigurableTask(Task):
         )
         self.task_docs = self.eval_docs
+        # for name, fn in self.config._fn.items():
+        #     if hasattr(self, name):
+        #         setattr(
+        #             self,
+        #             name,
+        #             types.MethodType(
+        #                 lambda self, *args, _fn=fn, **kwargs: _fn(*args, **kwargs),
+        #                 self,
+        #             ),
+        #         )
+
+        self.runtime_checks(self.task_docs[0])

     def download(
         self, dataset_kwargs: dict[str, Any] | None = None, **kwargs

@@ -968,6 +979,8 @@ class ConfigurableTask(Task):
         # if self.prompt is not None:
         #     doc_to_text = self.prompt
         doc_to_text = doc_to_text or self.config.doc_to_text
+        if callable(doc_to_text):
+            return doc_to_text(doc)
         if doc_to_text in doc:
             return doc[doc_to_text]
         elif isinstance(doc_to_text, str):

@@ -1013,6 +1026,8 @@ class ConfigurableTask(Task):
         # if self.prompt is not None:
         #     doc_to_target = self.prompt
         doc_to_target = doc_to_target or self.config.doc_to_target
+        if callable(doc_to_target):
+            doc_to_target(doc)
         if doc_to_target in doc:
             return doc[doc_to_target]
         elif isinstance(doc_to_target, str):
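Both hunks add an early branch so that a task config can supply a callable instead of a column name or template string for doc_to_text / doc_to_target. A minimal sketch of what such callables look like (the 'question' and 'answer' field names are illustrative, not taken from this diff):

    def doc_to_text(doc: dict) -> str:
        # Build the prompt from one dataset example.
        return f"Question: {doc['question']}\nAnswer:"


    def doc_to_target(doc: dict) -> str:
        # Return the gold continuation for the same example.
        return " " + doc["answer"]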
@@ -1274,6 +1289,8 @@ class ConfigurableTask(Task):
         )

     def process_results(self, doc: dict, results: list) -> dict[str, Any]:
+        if callable(self.config.process_results):
+            return self.config.process_results(doc, results)
         result_dict = {}
         use_metric = list(m.metric_name for m in self.config._metric_list)
         if self.OUTPUT_TYPE == "loglikelihood":
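The new branch lets a config-level process_results callable bypass the built-in metric handling entirely. Its signature is taken from the call site above (doc, results); the body below is only an illustration, and the "exact_match"/"answer" names are assumptions:

    def process_results(doc: dict, results: list) -> dict:
        # results holds the model outputs for this document; return one value
        # per metric key, to be aggregated later by the harness.
        prediction = results[0]
        return {"exact_match": float(str(prediction).strip() == doc["answer"])}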
@@ -1423,6 +1440,7 @@ class ConfigurableTask(Task):
         # Test One Doc
         self.features: list[str] = list(self.task_docs.features.keys())
         self.multiple_target = 0
+        self.multiple_input = 0
         test_text = self.doc_to_text(test_doc)
         test_target = self.doc_to_target(test_doc)

@@ -1430,13 +1448,19 @@ class ConfigurableTask(Task):
             test_choice = self.doc_to_choice(test_doc)
             if not isinstance(test_choice, list):
                 eval_logger.error("doc_to_choice must return list")
-            # else:
-            # num_choice = len(test_choice)
+            else:
+                num_choice = len(test_choice)
+                if isinstance(test_text, int):
+                    eval_logger.debug("doc_to_text returned an int. Assuming multiple inputs.")

             if isinstance(test_text, int):
                 eval_logger.debug(
                     "doc_to_text returned an int. Assuming multiple inputs."
                 )
+                self.multiple_input = num_choice
         else:
             test_choice = None
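The last hunk restores the multiple-input bookkeeping: multiple_input starts at 0 and is set to the number of choices when doc_to_text returns an int. A sketch of a doc and callables that would take this path (field names are illustrative, not from this diff):

    def doc_to_choice(doc: dict) -> list:
        return doc["choices"]


    def doc_to_text(doc: dict) -> int:
        # Returning an int rather than a prompt string triggers the
        # "Assuming multiple inputs" branch above, which records
        # multiple_input = len(choices) for this task.
        return doc["label"]


    doc = {"choices": ["The sky is blue.", "The sky is green."], "label": 0}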
lm_eval/config/evaluate_config.py

@@ -21,6 +21,7 @@ DICT_KEYS = [
     "hf_hub_log_args",
     "metadata",
     "model_args",
+    "gen_kwargs",
 ]

@@ -79,7 +80,7 @@ class EvaluatorConfig:
     # Device
     device: Optional[str] = field(
-        default=None, metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
+        default="cuda:0", metadata={"help": "Device to use (e.g. cuda, cuda:0, cpu)"}
     )

     # Data sampling and limiting

@@ -126,7 +127,10 @@ class EvaluatorConfig:
         default=None, metadata={"help": "Custom System instruction to add"}
     )
     apply_chat_template: Union[bool, str] = field(
-        default=False, metadata={"help": "Apply chat template to prompt"}
+        default=False,
+        metadata={
+            "help": "Apply chat template to prompt. Either True, or a string identifying the tokenizer template."
+        },
     )
     fewshot_as_multiturn: bool = field(
         default=False,

@@ -170,7 +174,7 @@ class EvaluatorConfig:
         metadata={"help": "Seeds for random, numpy, torch, fewshot (random)"},
     )

-    # Security and safety
+    # Security
     trust_remote_code: bool = field(
         default=False, metadata={"help": "Trust remote code for HF datasets"}
     )

@@ -201,7 +205,7 @@ class EvaluatorConfig:
             config.update(cls.load_yaml_config(namespace.config))

         # Override with CLI args (only truthy values, exclude non-config args)
-        excluded_args = {"config", "command", "func"}  # argparse internal args
+        excluded_args = {"command", "func"}  # argparse internal args
         cli_args = {
             k: v for k, v in vars(namespace).items() if v and k not in excluded_args
         }

@@ -252,7 +256,6 @@ class EvaluatorConfig:
         try:
             yaml_data = yaml.safe_load(config_file.read_text())
-            print(textwrap.dedent(f"""yaml: {yaml_data}"""))
         except yaml.YAMLError as e:
             raise ValueError(f"Invalid YAML in {config_path}: {e}")
         except (OSError, UnicodeDecodeError) as e:

@@ -337,17 +340,10 @@ class EvaluatorConfig:
             metadata=self.metadata if self.metadata else {},
         )
-        # self.tasks is a comma-separated string of task names
-        if isinstance((task_list := self.tasks), str):
-            task_list = self.tasks.split(",")
-        else:
-            assert isinstance(self.tasks, list), (
-                "`tasks` must be a comma delimited string of task names or list[str]."
-            )
-        task_names = task_manager.match_tasks(task_list)
+        task_names = task_manager.match_tasks(self.tasks)

         # Check for any individual task files in the list
-        for task in [task for task in task_list if task not in task_names]:
+        for task in [task for task in self.tasks if task not in task_names]:
             task_path = Path(task)
             if task_path.is_file():
                 config = utils.load_yaml_config(str(task_path))
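This hunk drops the string-splitting branch, presumably because --tasks is now declared with nargs="*" in run.py (and the YAML template already uses a list), so the value reaching EvaluatorConfig is a list that task_manager.match_tasks can consume directly. A small standalone argparse sketch of that behavior:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tasks", "-t", nargs="*", type=str, default=None)

    args = parser.parse_args(["--tasks", "arc_easy", "arc_challenge"])
    print(args.tasks)  # ['arc_easy', 'arc_challenge']
    # self.tasks is therefore already a list, so task_manager.match_tasks(self.tasks)
    # no longer needs the str.split(",") handling removed above.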
@@ -355,7 +351,7 @@ class EvaluatorConfig:
         # Check for missing tasks
         task_missing = [
-            task for task in task_list if task not in task_names and "*" not in task
+            task for task in self.tasks if task not in task_names and "*" not in task
         ]
         if task_missing:
lm_eval/config/metric.py

@@ -38,7 +38,7 @@ class MetricConfig:
             return is_higher_better(self.name)
         return self.higher_is_better

-    def compute_metric(self, *args, **kwargs) -> Any:
+    def compute(self, *args, **kwargs) -> Any:
         """Calculates the metric using the provided function and arguments."""
         if self.fn is None:
             raise ValueError(f"Metric function for {self.name} is not defined.")
lm_eval/config/task.py

@@ -10,7 +10,7 @@ import datasets
 from lm_eval.api.filter import FilterEnsemble
 from lm_eval.api.instance import OutputType
 from lm_eval.config.metric import MetricConfig
-from lm_eval.config.utils import doc_to_closure, maybe_serialize
+from lm_eval.config.utils import maybe_serialize

 if TYPE_CHECKING:

@@ -364,7 +364,7 @@ class TaskConfig:
     @classmethod
     def from_yaml(cls, data: dict[str, Any]) -> TaskConfig:
         """Create a TaskConfig instance from a YAML-like dictionary."""
-        fn = {k: doc_to_closure(v) for k, v in data.items() if callable(v)}
+        fn = {k: v for k, v in data.items() if callable(v)}
         return cls(**data, _fn=fn)

     @classmethod
lm_eval/evaluator.py

@@ -475,7 +475,9 @@ def evaluate(
             "Either 'limit' or 'samples' must be None, but both are not None."
         )
     if samples is not None:
-        eval_logger.info(f"Evaluating examples for tasks {list(samples.keys())}")
+        eval_logger.info(
+            f"Evaluating examples for tasks {[x for x in list(samples.keys()) if x in task_dict.keys()]}"
+        )
     if apply_chat_template:
         eval_logger.warning(
             "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
pyproject.toml

@@ -11,10 +11,10 @@ authors = [
 description = "A framework for evaluating language models"
 readme = "README.md"
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
-    "Operating System :: OS Independent"
+    "Operating System :: OS Independent",
 ]
 requires-python = ">=3.9"
 license = { "text" = "MIT" }
templates/example_ci_config.yaml

@@ -4,11 +4,12 @@
 # instead of passing them as command-line arguments.
 #
 # Usage:
-#   $ lm_eval --config configs/default_config.yaml
+#   $ lm_eval --config templates/example_ci_config.yaml
 #
-# You can override any values in this config with command-line arguments:
-#   $ lm_eval --config configs/default_config.yaml --model_args pretrained=gpt2 --tasks mmlu
+# You can override any values in this config with further command-line arguments:
+#   $ lm_eval --config templates/example_ci_config.yaml --model_args pretrained=gpt2 --tasks mmlu
 #
+# For expected types and values, refer to EvaluatorConfig in lm_eval/config/evaluate_config.py
 # All parameters are optional and have the same meaning as their CLI counterparts.

 model: hf

@@ -17,9 +18,18 @@ model_args:
   dtype: float16

 tasks:
   - hellaswag
-  - gsm8k
+  - arc_easy

 batch_size: 1
 trust_remote_code: true
 log_samples: true
 output_path: ./test
-limit: 10
+gen_kwargs:
+  do_sample: true
+  temperature: 0.7
+  stop: ["\n", "<|endoftext|>"]
+samples:
+  hellaswag: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+  arc_easy: [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+metadata:
+  name: Example CI Config
+  description: This is an example configuration file for testing purposes.
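The template now carries gen_kwargs, per-task samples, and a metadata block. Since EvaluatorConfig reads config files with yaml.safe_load (see the evaluate_config.py hunk above), these keys arrive as plain Python dicts and lists. A small sketch with a trimmed excerpt of the file, showing what the parse yields:

    import yaml

    excerpt = """
    gen_kwargs:
      do_sample: true
      temperature: 0.7
      stop: ["\\n", "<|endoftext|>"]
    samples:
      hellaswag: [1, 2, 3, 4, 5]
    """

    config = yaml.safe_load(excerpt)
    print(config["gen_kwargs"]["temperature"])  # 0.7
    print(config["samples"]["hellaswag"])       # [1, 2, 3, 4, 5]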