gaoqiong / lm-evaluation-harness

Commit 1f351067, authored Sep 18, 2023 by lintangsutawika
Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into qasper
Parents: 50f4428b, 33d52483
Showing 19 changed files with 322 additions and 161 deletions (+322 −161)
.github/workflows/new_tasks.yml      +2   −2
lm_eval/api/model.py                 +23  −3
lm_eval/api/task.py                  +21  −14
lm_eval/benchmarks/__init__.py       +1   −1
lm_eval/benchmarks/pythia.yaml       +4   −4
lm_eval/evaluator.py                 +106 −37
lm_eval/models/huggingface.py        +6   −3
lm_eval/prompts/__init__.py          +11  −2
lm_eval/tasks/__init__.py            +20  −9
lm_eval/tasks/nq_open/README.md      +0   −0   (new file)
lm_eval/tasks/nq_open/nq_open.yaml   +30  −0   (new file)
lm_eval/tasks/translation/utils.py   +1   −1
lm_eval/utils.py                     +5   −6
main.py                              +2   −2
pyproject.toml                       +81  −0
scripts/write_out.py                 +3   −1
setup.py                             +2   −74
tests/test_evaluator.py              +2   −1
tests/utils.py                       +2   −1
.github/workflows/new_tasks.yml

@@ -63,10 +63,10 @@ jobs:
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
         # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
lm_eval/api/model.py

 import abc
 import os
-from typing import Union, List, Tuple
+import torch
+from typing import Union, List, Tuple, Optional, Type, TypeVar
 from sqlitedict import SqliteDict
 import json
 import hashlib
@@ -11,6 +12,8 @@ from tqdm import tqdm
 from lm_eval import utils
 from lm_eval.logger import eval_logger

+T = TypeVar("T", bound="LM")
+

 class LM(abc.ABC):
     def __init__(self) -> None:
@@ -111,11 +114,28 @@ class LM(abc.ABC):
         pass

     @classmethod
-    def create_from_arg_string(cls, arg_string, additional_config=None):
+    def create_from_arg_string(
+        cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
+    ) -> T:
+        """
+        Creates an instance of the LM class using the given argument string and additional config.
+
+        Parameters:
+        - arg_string: A string containing arguments in the format key1=value1,key2=value2.
+        - additional_config: Optional dictionary containing additional configuration parameters.
+
+        Returns:
+        - Instance of the LM class.
+        """
         additional_config = {} if additional_config is None else additional_config
         args = utils.simple_parse_args_string(arg_string)
         args2 = {k: v for k, v in additional_config.items() if v is not None}
-        if args2.get("device") == "mps" or args.get("device") == "mps":
+        # TODO: delete once float16 MPS is fixed in torch stable
+        if (
+            args2.get("device") in ("mps", "mps:0")
+            or args.get("device") in ("mps", "mps:0")
+            and "dev" not in torch.__version__
+        ):
             args["dtype"] = "float32"
         return cls(**args, **args2)
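Note on the reworked classmethod: it parses a comma-separated key=value string, layers the extra config on top, and forces float32 when an MPS device is requested on a stable (non-dev) PyTorch build. Below is a minimal standalone sketch of that flow; parse_args and build_kwargs are illustrative stand-ins rather than the harness's own helpers, and torch_version is a plain parameter so the snippet runs without torch installed.

```python
from typing import Optional


def parse_args(arg_string: str) -> dict:
    # stand-in for lm_eval.utils.simple_parse_args_string:
    # "pretrained=gpt2,device=mps" -> {"pretrained": "gpt2", "device": "mps"}
    arg_string = arg_string.strip()
    if not arg_string:
        return {}
    return dict(kv.split("=", 1) for kv in arg_string.split(",") if kv)


def build_kwargs(arg_string: str, additional_config: Optional[dict] = None,
                 torch_version: str = "2.0.1") -> dict:
    # mirrors the merge-and-override logic sketched in the diff above
    additional_config = {} if additional_config is None else additional_config
    args = parse_args(arg_string)
    args2 = {k: v for k, v in additional_config.items() if v is not None}
    # stable (non-"dev") torch builds only support float32 on MPS
    if (
        args2.get("device") in ("mps", "mps:0")
        or args.get("device") in ("mps", "mps:0")
        and "dev" not in torch_version
    ):
        args["dtype"] = "float32"
    return {**args, **args2}


if __name__ == "__main__":
    print(build_kwargs("pretrained=gpt2,device=mps", {"batch_size": 8}))
    # -> {'pretrained': 'gpt2', 'device': 'mps', 'dtype': 'float32', 'batch_size': 8}
```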
lm_eval/api/task.py

@@ -674,11 +674,11 @@ class ConfigurableTask(Task):
                 check_choices = test_choice
             else:
                 check_choices = [test_target]
+            if self.config.doc_to_choice is not None:
                 for choice in check_choices:
-                    choice_has_whitespace = True if " " in choice else False
+                    choice_has_whitespace = True if choice[0].isspace() else False
                     delimiter_has_whitespace = (
-                        True if " " in self.config.target_delimiter else False
+                        True if self.config.target_delimiter[-1].isspace() else False
                     )

                     if delimiter_has_whitespace and choice_has_whitespace:
@@ -1067,6 +1067,9 @@ class ConfigurableTask(Task):
                 # it assumes that doc_to_target returns a number.
                 choices = self.doc_to_choice(doc)
                 gold = choices[gold]
+            # we expect multiple_targets to be a list.
+            elif self.multiple_target:
+                gold = list(gold)
             else:
                 gold = str(gold)
@@ -1077,6 +1080,10 @@ class ConfigurableTask(Task):
             # return true if any are true
             # TODO: this may break for multipLe_target, non zero-or-1 metrics
             scores = []
+            if not isinstance(gold, list):
+                # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
+                # print(gold)
+                gold = [gold]
             for gold_option in gold:
                 try:
                     result_score = self._metric_fn_list[metric](
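The revised whitespace check looks at boundary characters rather than substring membership: the case worth warning about is a target delimiter that already ends in whitespace combined with an answer choice that starts with whitespace, which would render a double space in the prompt. A small illustration of that boundary test (the function name and signature here are illustrative only):

```python
def double_space_risk(target_delimiter: str, choices: list) -> list:
    """Return the choices that would create a 'delimiter + leading space' artifact."""
    delimiter_has_whitespace = target_delimiter[-1].isspace() if target_delimiter else False
    flagged = []
    for choice in choices:
        choice_has_whitespace = choice[0].isspace() if choice else False
        if delimiter_has_whitespace and choice_has_whitespace:
            flagged.append(choice)
    return flagged


print(double_space_risk(" ", [" yes", "no"]))  # [' yes']  -> would render as a double space
print(double_space_risk(":", [" yes", "no"]))  # []        -> delimiter adds no whitespace
```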
lm_eval/benchmarks/__init__.py

@@ -44,7 +44,7 @@ def include_benchmarks(task_dir: str) -> None:
             task_names = utils.pattern_match(task_list, ALL_TASKS)
             for task in task_names:
-                if task in TASK_REGISTRY:
+                if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
                     if group in GROUP_REGISTRY:
                         GROUP_REGISTRY[group].append(task)
                     else:
lm_eval/benchmarks/pythia.yaml
View file @
1f351067
group
:
pythia
group
:
pythia
task
:
task
:
-
lambada_openai
-
lambada_openai
-
wikitext
-
logiqa
-
piqa
-
piqa
-
sciq
-
sciq
-
w
sc
-
w
ikitext
-
winogrande
-
winogrande
-
ar
c
-
ws
c
-
logiqa
-
ai2_arc
-
blimp
-
blimp
-
hendrycksTest*
-
hendrycksTest*
lm_eval/evaluator.py

@@ -120,6 +120,8 @@ def simple_evaluate(
         task_obj = task_dict[task_name]
         if type(task_obj) == tuple:
             group, task_obj = task_obj
+            if task_obj is None:
+                continue

         config = task_obj._config
         if num_fewshot is not None:
@@ -209,23 +211,30 @@ def evaluate(
     samples = collections.defaultdict(list)
     # tracks all Instances/requests a model must generate output on.
     requests = collections.defaultdict(list)
-    # Stores task scores based on task grouping.
-    aggregate = collections.defaultdict(dict)
-    # tracks if a task was chosen via user selecting a group containing it
-    task_groups = collections.defaultdict(dict)
+    # Aggregated task scores presented with groups
+    results_agg = collections.defaultdict(dict)
+    # Aggregated groups scores only
+    groups_agg = collections.defaultdict(dict)
     # stores the amount to pad out reqs per req. type so that
     # number of fwd passes per distributed rank is equal
     padding_requests = collections.defaultdict(int)
+    # store the hierarchy to do proper ordering
+    task_hierarchy = collections.defaultdict(list)
+    # Stores group related keys and values for group-aggregation
+    task_groups = collections.defaultdict(dict)
+    # store the ordering of tasks and groups
+    task_order = collections.defaultdict(int)
+    # store the aggregation for aggregating across tasks in the same group
+    sample_agg_fn = collections.defaultdict(dict)

     # get lists of each type of request
     for task_name, task in task_dict.items():
         if type(task) == tuple:
-            group, task = task
-            task_groups[task_name] = group
-            aggregate[task_name] = {}
+            group_name, task = task
+            task_hierarchy[group_name].append(task_name)
+        else:
+            task_hierarchy[task_name] = []
+
+        if task is None:
+            continue

         versions[task_name] = task.VERSION
         configs[task_name] = dict(task.dump_config())
@@ -301,6 +310,8 @@ def evaluate(
     for task_name, task in task_dict.items():
         if type(task) == tuple:
             group, task = task
+            if task is None:
+                continue

         task.apply_filters()

     ### Collect values of metrics on all datapoints ###
@@ -310,6 +321,8 @@ def evaluate(
     for task_name, task in task_dict.items():
         if type(task) == tuple:
             group, task = task
+            if task is None:
+                continue

         # TODO: make it possible to use a different metric per filter
         # iterate over different filters used
         for key in task.instances[0].filtered_resps.keys():
@@ -396,27 +409,64 @@ def evaluate(
         vals = vals_torch

     if lm.rank == 0:
+        ### Get task ordering for correct sample-wide aggregation
+        group_to_task = {}
+        for group in task_hierarchy.keys():
+            if group not in task_order:
+                task_order[group] = 0
+
+            if len(task_hierarchy[group]) > 0:
+                group_to_task[group] = task_hierarchy[group].copy()
+
+            for task in task_hierarchy[group]:
+                if task in task_order:
+                    task_order[task] += 1
+                else:
+                    task_order[task] = 1 + task_order[group]
+
+                if task in task_hierarchy:
+                    group_to_task[group].remove(task)
+                    group_to_task[group].extend(task_hierarchy[task])
+
+        task_to_group = {}
+        for group in group_to_task:
+            for task in group_to_task[group]:
+                if task in task_to_group:
+                    task_to_group[task].append(group)
+                else:
+                    task_to_group[task] = [group]
+
         ### Aggregate results over all datapoints ###
         # aggregate results ; run bootstrap CIs
         for (task_name, key, metric), items in vals.items():
             task = task_dict[task_name]
+            metric_key = metric + "," + key
             if type(task) == tuple:
-                group, task = task
+                group_name, task = task
+            else:
+                group_name = None

-            task_score = task.aggregation()[metric](items)
-            results[task_name][metric + "," + key] = task_score
-            # Need to put back in results
-            # pythia | acc
-            #        | perplexity
-            #        | word_perplexity
-            #        | byte_perplexity
-            #        | bits_per_byte
-            if task_name in task_groups:
-                group_name = task_groups[task_name]
-                if metric in list(aggregate[group_name].keys()):
-                    aggregate[group_name][metric].append(task_score)
-                else:
-                    aggregate[group_name][metric] = [task_score]
+            agg_fn = task.aggregation()[metric]
+            task_score = agg_fn(items)
+
+            if group_name is not None:
+                sample_metric_key = metric + "(sample agg)," + key
+                for grouping in task_to_group[task_name]:
+                    if metric_key in results[grouping]:
+                        results[grouping][metric_key].append(task_score)
+                    else:
+                        results[grouping][metric_key] = [task_score]
+
+                    if sample_metric_key in results[grouping]:
+                        results[grouping][sample_metric_key] += items
+                    else:
+                        results[grouping][sample_metric_key] = items.copy()
+                        sample_agg_fn[grouping][sample_metric_key] = agg_fn
+
+            results[task_name][metric_key] = task_score

             # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
             # so we run them less iterations. still looking for a cleaner way to do this
@@ -431,19 +481,38 @@ def evaluate(
             if stderr is not None:
                 results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        if bool(aggregate):
-            for group in aggregate.keys():
-                for metric in aggregate[group].keys():
-                    aggregate[group][metric] = np.average(aggregate[group][metric])
-                versions[group] = "N/A"
+        if bool(results):
+            for task_or_group in results.keys():
+                for metric in results[task_or_group].keys():
+                    if type(results[task_or_group][metric]) == list:
+                        if "(sample agg)" in metric:
+                            results[task_or_group][metric] = sample_agg_fn[task_or_group][metric](results[task_or_group][metric])
+                        else:
+                            results[task_or_group][metric] = np.average(
+                                results[task_or_group][metric]
+                            )
+                versions[task_or_group] = "N/A"
+
+        for task_name, task in task_dict.items():
+            if type(task) == tuple:
+                group_name, task = task
+                order = task_order[group_name]
+                tabbed_name = "-" * order + group_name
+                results_agg[tabbed_name] = results[group_name]
+                versions[tabbed_name] = versions[group_name]
+                if order == 0:
+                    groups_agg[group_name] = results[group_name]
+
+            order = task_order[task_name]
+            tabbed_name = "-" * order + task_name
+            results_agg[tabbed_name] = results[task_name]
+            versions[tabbed_name] = versions[task_name]

     results_dict = {
-        "results": dict(sorted(results.items())),
-        **(
-            {"aggregate": dict(sorted(aggregate.items()))}
-            if bool(aggregate)
-            else {}
-        ),
+        "results": dict(results_agg.items()),
+        **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
         "configs": dict(sorted(configs.items())),
         "versions": dict(sorted(versions.items())),
     }
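Taken together, the evaluator changes make per-task scores roll up into every group that directly or transitively contains the task, with list-valued group entries averaged at the end. Below is a toy sketch of that two-level roll-up in plain Python, using a hypothetical hierarchy and scores rather than real harness objects.

```python
import collections

# hypothetical hierarchy: a "pythia" group containing two leaf tasks
task_hierarchy = {"pythia": ["sciq", "piqa"], "sciq": [], "piqa": []}
task_scores = {"sciq": {"acc": 0.75}, "piqa": {"acc": 0.25}}

# invert the hierarchy: task -> groups it should roll up into
task_to_group = collections.defaultdict(list)
for group, members in task_hierarchy.items():
    for task in members:
        task_to_group[task].append(group)

results = collections.defaultdict(dict)
for task, metrics in task_scores.items():
    for metric, score in metrics.items():
        results[task][metric] = score
        for group in task_to_group[task]:
            results[group].setdefault(metric, []).append(score)

# any value still stored as a list is a group entry waiting to be aggregated
for name, metrics in results.items():
    for metric, value in metrics.items():
        if isinstance(value, list):
            results[name][metric] = sum(value) / len(value)

print(dict(results))
# {'sciq': {'acc': 0.75}, 'pythia': {'acc': 0.5}, 'piqa': {'acc': 0.25}}
```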
lm_eval/models/huggingface.py

@@ -107,17 +107,20 @@ class HFLM(LM):
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
             device_list = set(
-                ["cuda", "cpu", "mps"]
+                ["cuda", "cpu"]
                 + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                + ["mps", "mps:0"]
             )
             if device:
                 if device not in device_list:
                     device = int(device)
                 self._device = torch.device(device)
                 eval_logger.info(f"Using device '{device}'")
-                if device == "mps":
+                if device in ("mps", "mps:0") and "dev" not in torch.__version__:
                     eval_logger.info(
-                        "MPS is still in beta and only supports float32; setting dtype to float32."
+                        "MPS: Setting dtype to float32. To use float16 with MPS, please install a nightly build of "
+                        "PyTorch: pip3 install --pre torch torchvision torchaudio --index-url "
+                        "https://download.pytorch.org/whl/nightly/cpu"
                     )
             else:
                 eval_logger.info("Device not specified")
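The device handling now accepts mps and mps:0 explicitly and only logs the float32 fallback on stable (non-dev) PyTorch builds. A small sketch of that validation in isolation follows; resolve_device is illustrative, and cuda_count/torch_version are plain parameters so the snippet runs without torch installed.

```python
def resolve_device(device: str, cuda_count: int = 0, torch_version: str = "2.0.1"):
    """Return (device, note) roughly following the validation sketched in the diff."""
    device_list = set(
        ["cuda", "cpu"]
        + [f"cuda:{i}" for i in range(cuda_count)]
        + ["mps", "mps:0"]
    )
    note = ""
    if device not in device_list:
        # anything unrecognised is treated as a bare device index, e.g. "0"
        device = int(device)
    elif device in ("mps", "mps:0") and "dev" not in torch_version:
        note = "MPS on stable PyTorch: dtype will be forced to float32"
    return device, note


print(resolve_device("mps"))        # ('mps', 'MPS on stable PyTorch: ...')
print(resolve_device("cuda:1", 2))  # ('cuda:1', '')
print(resolve_device("0"))          # (0, '')
```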
lm_eval/prompts/__init__.py

+import ast
+from typing import Dict
+
 from lm_eval import utils
 from lm_eval.logger import eval_logger
@@ -5,7 +8,7 @@ from lm_eval.logger import eval_logger
 # Stores prompts in a dictionary indexed by 2 levels:
 # prompt category name, and prompt name.
 # This allows us to access prompts
-PROMPT_REGISTRY: dict[str, dict[str, str]] = {
+PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
     "qa-basic": {
         "question-newline-answer": "Question: {{question}}\nAnswer:",
         "q-newline-a": "Q: {{question}}\nA:",
@@ -63,6 +66,12 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwa
     else:
         prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)

-    category_name, prompt_name = use_prompt.split(":")
+    category_name, *prompt_name = use_prompt.split(":")
+
+    # TODO allow to multiple prompt naming
+    # if len(prompt_name) > 1:
+    #     prompt_list = []
+    #     for prompt in prompt_name:
+    #         prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
+    # else:
     prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)

     return [":".join([category_name, prompt]) for prompt in prompt_list]
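The star-unpacking (category_name, *prompt_name = use_prompt.split(":")) keeps the split from raising when the prompt part is missing or contains extra colons, and the result is then wildcard-matched against the available template names. A rough sketch of that behaviour, using fnmatch as a stand-in for utils.pattern_match and a made-up "*" default when no prompt part is given:

```python
import fnmatch


def load_prompt_names(use_prompt: str, all_template_names: list) -> list:
    # "qa-basic:q-*" -> ("qa-basic", ["q-*"]); "qa-basic" -> ("qa-basic", [])
    category_name, *prompt_patterns = use_prompt.split(":")
    matched = [
        name
        for pattern in (prompt_patterns or ["*"])
        for name in all_template_names
        if fnmatch.fnmatch(name, pattern)
    ]
    return [":".join([category_name, prompt]) for prompt in matched]


templates = ["question-newline-answer", "q-newline-a"]
print(load_prompt_names("qa-basic:q-*", templates))  # ['qa-basic:q-newline-a']
```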
lm_eval/tasks/__init__.py
View file @
1f351067
import
os
import
os
import
yaml
import
yaml
from
typing
import
List
,
Union
from
typing
import
List
,
Union
,
Dict
from
lm_eval
import
utils
from
lm_eval
import
utils
from
lm_eval
import
prompts
from
lm_eval
import
prompts
...
@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
...
@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
)
)
def
register_configurable_task
(
config
:
d
ict
[
str
,
str
])
->
int
:
def
register_configurable_task
(
config
:
D
ict
[
str
,
str
])
->
int
:
SubClass
=
type
(
SubClass
=
type
(
config
[
"task"
]
+
"ConfigurableTask"
,
config
[
"task"
]
+
"ConfigurableTask"
,
(
ConfigurableTask
,),
(
ConfigurableTask
,),
...
@@ -38,7 +38,7 @@ def register_configurable_task(config: dict[str, str]) -> int:
...
@@ -38,7 +38,7 @@ def register_configurable_task(config: dict[str, str]) -> int:
return
0
return
0
def
check_prompt_config
(
config
:
d
ict
[
str
,
str
])
->
List
[
d
ict
[
str
,
str
]]:
def
check_prompt_config
(
config
:
D
ict
[
str
,
str
])
->
List
[
D
ict
[
str
,
str
]]:
all_configs
=
[]
all_configs
=
[]
if
"use_prompt"
in
config
:
if
"use_prompt"
in
config
:
prompt_list
=
prompts
.
load_prompt_list
(
prompt_list
=
prompts
.
load_prompt_list
(
...
@@ -69,7 +69,7 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
...
@@ -69,7 +69,7 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
return
all_configs
return
all_configs
def
get_task_name_from_config
(
task_config
:
d
ict
[
str
,
str
])
->
str
:
def
get_task_name_from_config
(
task_config
:
D
ict
[
str
,
str
])
->
str
:
if
"dataset_name"
in
task_config
:
if
"dataset_name"
in
task_config
:
return
"{dataset_path}_{dataset_name}"
.
format
(
**
task_config
)
return
"{dataset_path}_{dataset_name}"
.
format
(
**
task_config
)
else
:
else
:
...
@@ -128,7 +128,7 @@ def get_task_name_from_object(task_object):
...
@@ -128,7 +128,7 @@ def get_task_name_from_object(task_object):
# TODO: pass num_fewshot and other cmdline overrides in a better way
# TODO: pass num_fewshot and other cmdline overrides in a better way
def
get_task_dict
(
task_name_list
:
List
[
Union
[
str
,
d
ict
,
Task
]],
**
kwargs
):
def
get_task_dict
(
task_name_list
:
List
[
Union
[
str
,
D
ict
,
Task
]],
**
kwargs
):
config
=
{
**
kwargs
}
config
=
{
**
kwargs
}
...
@@ -136,6 +136,9 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
...
@@ -136,6 +136,9 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
task_name_from_config_dict
=
{}
task_name_from_config_dict
=
{}
task_name_from_object_dict
=
{}
task_name_from_object_dict
=
{}
if
type
(
task_name_list
)
!=
list
:
task_name_list
=
[
task_name_list
]
for
task_element
in
task_name_list
:
for
task_element
in
task_name_list
:
if
isinstance
(
task_element
,
str
):
if
isinstance
(
task_element
,
str
):
...
@@ -143,12 +146,20 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
...
@@ -143,12 +146,20 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
group_name
=
task_element
group_name
=
task_element
for
task_name
in
GROUP_REGISTRY
[
task_element
]:
for
task_name
in
GROUP_REGISTRY
[
task_element
]:
if
task_name
not
in
task_name_from_registry_dict
:
if
task_name
not
in
task_name_from_registry_dict
:
task_obj
=
get_task_dict
(
task_name
)
if
task_name
in
task_obj
.
keys
():
task_dict
=
{
task_name
:
(
group_name
,
task_obj
[
task_name
]),
}
else
:
task_dict
=
{
task_name
:
(
group_name
,
None
),
**
task_obj
,
}
task_name_from_registry_dict
=
{
task_name_from_registry_dict
=
{
**
task_name_from_registry_dict
,
**
task_name_from_registry_dict
,
task_name
:
(
**
task_dict
,
group_name
,
get_task
(
task_name
=
task_name
,
config
=
config
),
),
}
}
else
:
else
:
task_name
=
task_element
task_name
=
task_element
...
...
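The new group expansion calls get_task_dict recursively, so a group may contain other groups; members that resolve to another group are recorded with a None task object as placeholders while their children are kept alongside. A toy sketch of that expansion rule with made-up registries (not the harness's real signatures):

```python
# toy registries standing in for TASK_REGISTRY / GROUP_REGISTRY
TASKS = {"sciq": "SciqTask()", "piqa": "PiqaTask()"}
GROUPS = {"pythia": ["sciq", "nested"], "nested": ["piqa"]}


def get_task_dict(name: str) -> dict:
    """Recursively expand a task or group name into {task_name: (group, task_obj)}."""
    if name in TASKS:
        return {name: (None, TASKS[name])}
    out = {}
    for member in GROUPS[name]:
        resolved = get_task_dict(member)
        if member in resolved:
            out[member] = (name, resolved[member][1])  # direct member task
        else:
            out[member] = (name, None)                 # member is itself a group
            out.update(resolved)                       # keep its children too
    return out


print(get_task_dict("pythia"))
# {'sciq': ('pythia', 'SciqTask()'), 'nested': ('pythia', None), 'piqa': ('nested', 'PiqaTask()')}
```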
lm_eval/tasks/nq_open/README.md (new, empty file)

lm_eval/tasks/nq_open/nq_open.yaml (new file):

task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  until:
    - "\n"
    - "."
    - ","
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - "\ban|a|the\b"
lm_eval/tasks/translation/utils.py

@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
     raise Exception(
         "`pycountry` is required for generating translation task prompt templates. \
-please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
     )
lm_eval/utils.py

@@ -16,7 +16,6 @@ import gc
 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
@@ -267,9 +266,9 @@ def make_table(result_dict, column: str = "results"):
     from pytablewriter import MarkdownTableWriter, LatexTableWriter

     if column == "results":
-        column_name = "Task"
-    elif column == "aggregate":
-        column_name = "Benchmark"
+        column_name = "Tasks"
+    elif column == "groups":
+        column_name = "Groups"

     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()
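With OmegaConf gone, simple_parse_args_string is a plain split plus dict comprehension, so every parsed value stays a string (the dotlist parsing previously coerced some value types); callers presumably cast where they need numbers or booleans. A runnable sketch of the new behaviour:

```python
def simple_parse_args_string(args_string: str) -> dict:
    # mirrors the OmegaConf-free version in the diff: no type coercion, values stay strings
    args_string = args_string.strip()
    if not args_string:
        return {}
    arg_list = [arg for arg in args_string.split(",") if arg]
    return {k: v for k, v in [arg.split("=") for arg in arg_list]}


args = simple_parse_args_string("pretrained=EleutherAI/pythia-160m,dtype=float32,trust_remote_code=True")
print(args["dtype"])                    # 'float32'
print(type(args["trust_remote_code"]))  # <class 'str'> -- no longer coerced to bool
```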
main.py

@@ -209,8 +209,8 @@ def main() -> None:
         f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
     )
     print(evaluator.make_table(results))
-    if "aggregate" in results:
-        print(evaluator.make_table(results, "aggregate"))
+    if "groups" in results:
+        print(evaluator.make_table(results, "groups"))


 if __name__ == "__main__":
pyproject.toml (+81 −0; the existing [build-system] block is unchanged, everything below it is new):

[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "lm_eval"
version = "1.0.0"
authors = [
    {name="EleutherAI", email="contact@eleuther.ai"}
]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
requires-python = ">=3.9"
license = { "text" = "MIT" }
dependencies = [
    "accelerate>=0.21.0",
    "evaluate",
    "datasets>=2.0.0",
    "evaluate>=0.4.0",
    "jsonlines",
    "numexpr",
    "peft>=0.2.0",
    "pybind11>=2.6.2",
    "pytablewriter",
    "rouge-score>=0.0.4",
    "sacrebleu>=1.5.0",
    "scikit-learn>=0.24.1",
    "sqlitedict",
    "torch>=1.8",
    "tqdm-multiprocess",
    "transformers>=4.1",
    "zstandard",
]

[tool.setuptools]
packages = ["lm_eval"]

# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]

[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"

[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"

[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = [
    "flake8",
    "pylint",
    "mypy",
    "pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
    "lm_eval[dev]",
    "lm_eval[testing]",
    "lm_eval[linting]",
    "lm_eval[multilingual]",
    "lm_eval[sentencepiece]",
    "lm_eval[promptsource]",
    "lm_eval[gptq]",
    "lm_eval[anthropic]",
    "lm_eval[openai]",
]
scripts/write_out.py

@@ -38,12 +38,14 @@ def main():
         iters = []

         for set in args.sets.split(","):
+            docs = None
             if set == "train" and task.has_training_docs():
                 docs = task.training_docs()
             if set == "val" and task.has_validation_docs():
                 docs = task.validation_docs()
             if set == "test" and task.has_test_docs():
                 docs = task.test_docs()
-            iters.append(docs)
+            if docs is not None:
+                iters.append(docs)

         docs = join_iters(iters)
setup.py

 import setuptools
-import itertools

-with open("README.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
+# This is to make sure that the package supports editable installs
+setuptools.setup()

-extras_require = {
-    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-    "linting": [
-        "flake8",
-        "pylint",
-        "mypy",
-        "pre-commit",
-    ],
-    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
-    "promptsource": [
-        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-    ],
-    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-    "anthropic": ["anthropic"],
-    "openai": ["openai", "tiktoken"],
-}
-extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
-
-setuptools.setup(
-    name="lm_eval",
-    version="1.0.0",
-    author="EleutherAI",
-    author_email="contact@eleuther.ai",
-    description="A framework for evaluating language models",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/EleutherAI/lm-evaluation-harness",
-    packages=setuptools.find_packages(),
-    # required to include yaml files in pip installation
-    package_data={
-        "lm_eval": ["**/*.yaml", "tasks/**/*"],
-        "examples": ["**/*.yaml"],
-    },
-    entry_points={
-        "console_scripts": [
-            "lm-eval = main:main",
-            "lm_eval = main:main",
-        ],
-    },
-    include_package_data=True,
-    classifiers=[
-        "Development Status :: 3 - Alpha",
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.9",
-    install_requires=[
-        "accelerate>=0.21.0",
-        "evaluate",
-        "datasets>=2.0.0",
-        "evaluate>=0.4.0",
-        "jsonlines",
-        "numexpr",
-        "omegaconf>=2.2",
-        "peft>=0.2.0",
-        "pybind11>=2.6.2",
-        "pytablewriter",
-        "rouge-score>=0.0.4",
-        "sacrebleu>=1.5.0",
-        "scikit-learn>=0.24.1",
-        "sqlitedict",
-        "torch>=1.8",
-        "tqdm-multiprocess",
-        "transformers>=4.1",
-        "zstandard",
-    ],
-    extras_require=extras_require,
-)
tests/test_evaluator.py

@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List

 import random
 import pytest
@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
tests/utils.py

@@ -9,6 +9,7 @@ import os
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"

 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
     return list(_output)

-def new_tasks() -> Union[list[str], None]:
+def new_tasks() -> Union[List[str], None]:
     FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
     if os.path.exists(FILENAME):
         # If tasks folder has changed then we get the list of files from FILENAME