Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
b9ee592b
"docs/source/en/api/pipelines/unclip.mdx" did not exist on "b25843e799d42246d7a60808259dc5c28163446f"
Commit
b9ee592b
authored
Jul 04, 2025
by
Baber
Browse files
nit
parent
f3cfff61
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
43 additions
and
52 deletions
+43
-52
docs/interface.md
docs/interface.md
+1
-1
lm_eval/__main__.py
lm_eval/__main__.py
+3
-3
lm_eval/_cli/harness.py
lm_eval/_cli/harness.py
+4
-4
lm_eval/_cli/ls.py
lm_eval/_cli/ls.py
+9
-9
lm_eval/_cli/run.py
lm_eval/_cli/run.py
+8
-9
lm_eval/_cli/subcommand.py
lm_eval/_cli/subcommand.py
+0
-5
lm_eval/_cli/validate.py
lm_eval/_cli/validate.py
+2
-2
lm_eval/config/evaluate_config.py
lm_eval/config/evaluate_config.py
+16
-19
templates/example_ci_config.yaml
templates/example_ci_config.yaml
+0
-0
No files found.
docs/interface.md
View file @
b9ee592b
...
...
@@ -13,7 +13,7 @@ Equivalently, running the library can be done via the `lm-eval` entrypoint at th
The CLI now uses a subcommand structure for better organization:
-
`lm-eval run`
- Execute evaluations (default behavior)
-
`lm-eval l
ist
`
- List available tasks, models, etc.
-
`lm-eval l
s
`
- List available tasks, models, etc.
-
`lm-eval validate`
- Validate task configurations
For backward compatibility, if no subcommand is specified,
`run`
is automatically inserted. So
`lm-eval --model hf --tasks hellaswag`
is equivalent to
`lm-eval run --model hf --tasks hellaswag`
.
...
...
lm_eval/__main__.py
View file @
b9ee592b
from
lm_eval._cli.
eval
import
Eval
from
lm_eval._cli.
harness
import
HarnessCLI
from
lm_eval.utils
import
setup_logging
def
cli_evaluate
()
->
None
:
"""Main CLI entry point
with subcommand and legacy support
."""
"""Main CLI entry point."""
setup_logging
()
parser
=
Eval
()
parser
=
HarnessCLI
()
args
=
parser
.
parse_args
()
parser
.
execute
(
args
)
...
...
lm_eval/_cli/
eval
.py
→
lm_eval/_cli/
harness
.py
View file @
b9ee592b
...
...
@@ -2,12 +2,12 @@ import argparse
import
sys
import
textwrap
from
lm_eval._cli.l
istall
import
List
All
from
lm_eval._cli.l
s
import
List
from
lm_eval._cli.run
import
Run
from
lm_eval._cli.validate
import
Validate
class
Eval
:
class
HarnessCLI
:
"""Main CLI parser that manages all subcommands."""
def
__init__
(
self
):
...
...
@@ -20,7 +20,7 @@ class Eval:
lm-eval run --model hf --model_args pretrained=gpt2 --tasks hellaswag
# List available tasks
lm-eval l
ist
tasks
lm-eval l
s
tasks
# Validate task configurations
lm-eval validate --tasks hellaswag,arc_easy
...
...
@@ -40,7 +40,7 @@ class Eval:
dest
=
"command"
,
help
=
"Available commands"
,
metavar
=
"COMMAND"
)
Run
.
create
(
self
.
_subparsers
)
List
All
.
create
(
self
.
_subparsers
)
List
.
create
(
self
.
_subparsers
)
Validate
.
create
(
self
.
_subparsers
)
def
parse_args
(
self
)
->
argparse
.
Namespace
:
...
...
lm_eval/_cli/l
istall
.py
→
lm_eval/_cli/l
s
.py
View file @
b9ee592b
...
...
@@ -4,33 +4,33 @@ import textwrap
from
lm_eval._cli.subcommand
import
SubCommand
class
List
All
(
SubCommand
):
class
List
(
SubCommand
):
"""Command for listing available tasks."""
def
__init__
(
self
,
subparsers
:
argparse
.
_SubParsersAction
,
*
args
,
**
kwargs
):
# Create and configure the parser
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
_parser
=
subparsers
.
add_parser
(
"l
ist
"
,
"l
s
"
,
help
=
"List available tasks, groups, subtasks, or tags"
,
description
=
"List available tasks, groups, subtasks, or tags from the evaluation harness."
,
usage
=
"lm-eval list [tasks|groups|subtasks|tags] [--include_path DIR]"
,
epilog
=
textwrap
.
dedent
(
"""
examples:
# List all available tasks (includes groups, subtasks, and tags)
$ lm-eval l
ist
tasks
$ lm-eval l
s
tasks
# List only task groups (like 'mmlu', 'glue', 'superglue')
$ lm-eval l
ist
groups
$ lm-eval l
s
groups
# List only individual subtasks (like 'mmlu_abstract_algebra')
$ lm-eval l
ist
subtasks
$ lm-eval l
s
subtasks
# Include external task definitions
$ lm-eval l
ist
tasks --include_path /path/to/external/tasks
$ lm-eval l
s
tasks --include_path /path/to/external/tasks
# List tasks from multiple external paths
$ lm-eval l
ist
tasks --include_path "/path/to/tasks1:/path/to/tasks2"
$ lm-eval l
s
tasks --include_path "/path/to/tasks1:/path/to/tasks2"
organization:
• Groups: Collections of tasks with aggregated metric across subtasks (e.g., 'mmlu')
...
...
@@ -46,7 +46,7 @@ class ListAll(SubCommand):
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
)
self
.
_add_args
()
self
.
_parser
.
set_defaults
(
func
=
lambda
arg
:
self
.
_parser
.
print_help
()
)
self
.
_parser
.
set_defaults
(
func
=
self
.
_execute
)
def
_add_args
(
self
)
->
None
:
self
.
_parser
.
add_argument
(
...
...
@@ -63,7 +63,7 @@ class ListAll(SubCommand):
help
=
"Additional path to include if there are external tasks."
,
)
def
execute
(
self
,
args
:
argparse
.
Namespace
)
->
None
:
def
_
execute
(
self
,
args
:
argparse
.
Namespace
)
->
None
:
"""Execute the list command."""
from
lm_eval.tasks
import
TaskManager
...
...
lm_eval/_cli/run.py
View file @
b9ee592b
...
...
@@ -42,7 +42,7 @@ class Run(SubCommand):
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
)
self
.
_add_args
()
self
.
_parser
.
set_defaults
(
func
=
self
.
execute
)
self
.
_parser
.
set_defaults
(
func
=
self
.
_
execute
)
def
_add_args
(
self
)
->
None
:
self
.
_parser
=
self
.
_parser
...
...
@@ -313,14 +313,17 @@ class Run(SubCommand):
),
)
def
execute
(
self
,
args
:
argparse
.
Namespace
)
->
None
:
def
_
execute
(
self
,
args
:
argparse
.
Namespace
)
->
None
:
"""Runs the evaluation harness with the provided arguments."""
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"false"
from
lm_eval.config.evaluate_config
import
EvaluatorConfig
# Create and validate config (most validation now happens in EvaluationConfig)
eval_logger
=
logging
.
getLogger
(
__name__
)
# Create and validate config (most validation now occurs in EvaluationConfig)
cfg
=
EvaluatorConfig
.
from_cli
(
args
)
from
lm_eval
import
simple_evaluate
,
utils
from
lm_eval
import
simple_evaluate
from
lm_eval.loggers
import
EvaluationTracker
,
WandbLogger
from
lm_eval.utils
import
handle_non_serializable
,
make_table
...
...
@@ -328,10 +331,6 @@ class Run(SubCommand):
if
cfg
.
wandb_args
:
wandb_logger
=
WandbLogger
(
cfg
.
wandb_args
,
cfg
.
wandb_config_args
)
utils
.
setup_logging
(
cfg
.
verbosity
)
eval_logger
=
logging
.
getLogger
(
__name__
)
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"false"
# Set up evaluation tracker
if
cfg
.
output_path
:
cfg
.
hf_hub_log_args
[
"output_path"
]
=
cfg
.
output_path
...
...
@@ -342,7 +341,7 @@ class Run(SubCommand):
evaluation_tracker
=
EvaluationTracker
(
**
cfg
.
hf_hub_log_args
)
# Create task manager (metadata already set up in config validation)
task_manager
=
cfg
.
process_tasks
()
task_manager
=
cfg
.
process_tasks
(
cfg
.
metadata
)
# Validation warnings (keep these in CLI as they're logging-specific)
if
"push_samples_to_hub"
in
cfg
.
hf_hub_log_args
and
not
cfg
.
log_samples
:
...
...
lm_eval/_cli/subcommand.py
View file @
b9ee592b
...
...
@@ -17,8 +17,3 @@ class SubCommand(ABC):
def
_add_args
(
self
)
->
None
:
"""Add arguments specific to this subcommand."""
pass
@
abstractmethod
def
execute
(
self
,
args
:
argparse
.
Namespace
)
->
None
:
"""Execute the subcommand with the given arguments."""
pass
lm_eval/_cli/validate.py
View file @
b9ee592b
...
...
@@ -73,7 +73,7 @@ class Validate(SubCommand):
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
)
self
.
_add_args
()
self
.
_parser
.
set_defaults
(
func
=
lambda
arg
:
self
.
_parser
.
print_help
()
)
self
.
_parser
.
set_defaults
(
func
=
self
.
_execute
)
def
_add_args
(
self
)
->
None
:
self
.
_parser
.
add_argument
(
...
...
@@ -92,7 +92,7 @@ class Validate(SubCommand):
help
=
"Additional path to include if there are external tasks."
,
)
def
execute
(
self
,
args
:
argparse
.
Namespace
)
->
None
:
def
_
execute
(
self
,
args
:
argparse
.
Namespace
)
->
None
:
"""Execute the validate command."""
from
lm_eval.tasks
import
TaskManager
...
...
lm_eval/config/evaluate_config.py
View file @
b9ee592b
...
...
@@ -187,14 +187,6 @@ class EvaluatorConfig:
metadata
=
{
"help"
:
"Additional metadata for tasks that require it"
},
)
@
staticmethod
def
_parse_dict_args
(
config
:
Dict
[
str
,
Any
])
->
Dict
[
str
,
Any
]:
"""Parse string arguments that should be dictionaries."""
for
key
in
config
:
if
key
in
DICT_KEYS
and
isinstance
(
config
[
key
],
str
):
config
[
key
]
=
simple_parse_args_string
(
config
[
key
])
return
config
@
classmethod
def
from_cli
(
cls
,
namespace
:
Namespace
)
->
"EvaluatorConfig"
:
"""
...
...
@@ -206,7 +198,7 @@ class EvaluatorConfig:
# Load and merge YAML config if provided
if
used_config
:
=
hasattr
(
namespace
,
"config"
)
and
namespace
.
config
:
config
.
update
(
cls
.
_
load_yaml_config
(
namespace
.
config
))
config
.
update
(
cls
.
load_yaml_config
(
namespace
.
config
))
# Override with CLI args (only truthy values, exclude non-config args)
excluded_args
=
{
"config"
,
"command"
,
"func"
}
# argparse internal args
...
...
@@ -222,7 +214,7 @@ class EvaluatorConfig:
instance
=
cls
(
**
config
)
if
used_config
:
print
(
textwrap
.
dedent
(
f
"""
{
instance
}
"""
))
instance
.
validate_and_preprocess
()
instance
.
configure
()
return
instance
...
...
@@ -233,19 +225,24 @@ class EvaluatorConfig:
Merges with built-in defaults and validates.
"""
# Load YAML config
yaml_config
=
cls
.
_load_yaml_config
(
config_path
)
yaml_config
=
cls
.
load_yaml_config
(
config_path
)
# Parse string arguments that should be dictionaries
yaml_config
=
cls
.
_parse_dict_args
(
yaml_config
)
# Create instance and validate
instance
=
cls
(
**
yaml_config
)
instance
.
validate_and_preprocess
()
instance
.
configure
()
return
instance
@
staticmethod
def
_load_yaml_config
(
config_path
:
Union
[
str
,
Path
])
->
Dict
[
str
,
Any
]:
def
_parse_dict_args
(
config
:
Dict
[
str
,
Any
])
->
Dict
[
str
,
Any
]:
"""Parse string arguments that should be dictionaries."""
for
key
in
config
:
if
key
in
DICT_KEYS
and
isinstance
(
config
[
key
],
str
):
config
[
key
]
=
simple_parse_args_string
(
config
[
key
])
return
config
@
staticmethod
def
load_yaml_config
(
config_path
:
Union
[
str
,
Path
])
->
Dict
[
str
,
Any
]:
"""Load and validate YAML config file."""
config_file
=
(
Path
(
config_path
)
if
not
isinstance
(
config_path
,
Path
)
else
config_path
...
...
@@ -268,11 +265,11 @@ class EvaluatorConfig:
return
yaml_data
def
validate_and_preprocess
(
self
)
->
None
:
def
configure
(
self
)
->
None
:
"""Validate configuration and preprocess fields after creation."""
self
.
_validate_arguments
()
self
.
_process_arguments
()
self
.
_
apply
_trust_remote_code
()
self
.
_
set
_trust_remote_code
()
def
_validate_arguments
(
self
)
->
None
:
"""Validate configuration arguments and cross-field constraints."""
...
...
@@ -369,7 +366,7 @@ class EvaluatorConfig:
self
.
tasks
=
task_names
return
task_manager
def
_
apply
_trust_remote_code
(
self
)
->
None
:
def
_
set
_trust_remote_code
(
self
)
->
None
:
"""Apply trust_remote_code setting if enabled."""
if
self
.
trust_remote_code
:
# HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
...
...
configs/default
_config.yaml
→
templates/example_ci
_config.yaml
View file @
b9ee592b
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment