gaoqiong / lm-evaluation-harness · Commits

Commit 3263c572
Authored Sep 18, 2023 by lintangsutawika
Parents: a27e8ed1, 33d52483

    Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into squadv2
Showing 14 changed files with 348 additions and 374 deletions.
Files changed:

    lm_eval/tasks/wsc273/default.yaml    +15   -0
    lm_eval/tasks/wsc273/utils.py        +36   -0
    lm_eval/utils.py                     +16   -20
    main.py                              +23   -9
    mypy.ini                             +29   -0
    pyproject.toml                       +84   -0
    scripts/write_out.py                 +3    -1
    setup.py                             +2    -75
    tests/extra/test_new_tasks.py        +0    -129
    tests/extra/test_utils.py            +0    -23
    tests/models/test_huggingface.py     +14   -3
    tests/test_evaluator.py              +2    -1
    tests/test_tasks.py                  +105  -110
    tests/utils.py                       +19   -3
lm_eval/tasks/wsc273/default.yaml  (new file)

task: wsc273
dataset_path: winograd_wsc
dataset_name: wsc273
output_type: multiple_choice
test_split: test
doc_to_text: label
process_docs: !function utils.process_doc
doc_to_target: "{% set index = pronoun_loc + pronoun | length %}{{text[index:]}}"
doc_to_choice: "{% set template = text[:pronoun_loc] %}{{[template+options[0], template+options[1]]}}"
should_decontaminate: true
doc_to_decontamination_query: text
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
lm_eval/tasks/wsc273/utils.py  (new file)

upper_pronouns = [
    "A",
    "An",
    "The",
    "She",
    "He",
    "It",
    "They",
    "My",
    "His",
    "Her",
    "Their",
]


def process_doc(dataset):
    def process_fn(doc):
        # The HF implementation of `wsc273` is not `partial evaluation` friendly.
        doc["text"] = doc["text"].replace("  ", " ")
        doc["options"][0] = __normalize_option(doc, doc["options"][0])
        doc["options"][1] = __normalize_option(doc, doc["options"][1])
        return doc

    return dataset.map(process_fn)


def __normalize_option(doc, option):
    # Append `'s` to possessive determiner based options.
    if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
        option += "'s"

    # Appropriately lowercase the pronoun in the option.
    pronoun = option.split()[0]
    start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
    if not start_of_sentence and pronoun in upper_pronouns:
        return option.replace(pronoun, pronoun.lower())
    return option
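Taken together, the new task works like this: process_docs normalizes each document, doc_to_choice keeps the text up to the pronoun and appends each candidate option, and doc_to_target keeps the text after the pronoun as the shared continuation to be scored. A minimal plain-Python sketch of how the two Jinja templates resolve, using a made-up document laid out like a winograd_wsc record (not taken from the dataset):

    # Sketch only; the document below is hypothetical, for illustration.
    text = "The trophy doesn't fit into the suitcase because it is too small."
    doc = {
        "text": text,
        "pronoun": "it",
        "pronoun_loc": text.index(" it ") + 1,  # start index of the pronoun
        "options": ["the trophy", "the suitcase"],
    }

    # doc_to_choice: keep everything before the pronoun, append each option.
    template = doc["text"][: doc["pronoun_loc"]]
    choices = [template + doc["options"][0], template + doc["options"][1]]

    # doc_to_target: keep everything after the pronoun; this is the shared
    # continuation the model scores given each choice as context.
    index = doc["pronoun_loc"] + len(doc["pronoun"])
    target = doc["text"][index:]

    print(choices[1] + target)  # "... because the suitcase is too small."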
lm_eval/utils.py

@@ -10,13 +10,12 @@ import collections
 import importlib.util
 import fnmatch
-from typing import List, Literal, Union
+from typing import Iterator, List, Literal, Union
 import gc

 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
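The rewritten simple_parse_args_string drops the OmegaConf dependency and parses the comma-separated key=value string by hand; one apparent side effect is that values now stay plain strings, where OmegaConf would infer primitive types. A standalone sketch of the same logic, assuming every item really is a single key=value pair:

    def parse_args_string(args_string: str) -> dict:
        # Mirrors the new simple_parse_args_string: comma-separated key=value
        # pairs, empty items skipped, no OmegaConf involved.
        args_string = args_string.strip()
        if not args_string:
            return {}
        arg_list = [arg for arg in args_string.split(",") if arg]
        return {k: v for k, v in [arg.split("=") for arg in arg_list]}

    print(parse_args_string("pretrained=EleutherAI/pythia-160m,dtype=float32"))
    # -> {'pretrained': 'EleutherAI/pythia-160m', 'dtype': 'float32'}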
@@ -65,7 +64,7 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n=0, fn=None):
+def chunks(iter, n: int = 0, fn=None):
     arr = []
     for i, x in enumerate(iter):
         arr.append(x)

@@ -87,11 +86,11 @@ def group(arr, fn):
 class MultiChoice:
-    def __init__(self, choices):
+    def __init__(self, choices) -> None:
         self.choices = choices

     # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
+    def __contains__(self, values) -> bool:
         for value in values.split(","):
             if len(fnmatch.filter(self.choices, value)) == 0:
                 eval_logger.info(f"Available tasks to choose:")

@@ -100,7 +99,7 @@ class MultiChoice:
                 raise ValueError("'{}' is not in task list".format(value))
         return True

-    def __iter__(self):
+    def __iter__(self) -> Iterator:
         for choice in self.choices:
             yield choice

@@ -108,7 +107,6 @@ class MultiChoice:
 # Returns a list containing all values of the source_list that
 # match at least one of the patterns
 def pattern_match(patterns, source_list):
     if type(patterns) == str:
         patterns = [patterns]

@@ -177,7 +175,7 @@ def make_disjoint_window(pair):
 class Reorderer:
-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         self.size = len(arr)
         arr = list(enumerate(arr))
         arr = group(arr, lambda x: fn(x[1]))

@@ -212,7 +210,7 @@ class Grouper:
     objects in `arr` satisfying `key == fn(ob)`.
     """

-    def __init__(self, arr, fn):
+    def __init__(self, arr, fn) -> None:
         # self.orig_arr = arr
         self.size = len(arr)
         arr = list(enumerate(arr))

@@ -263,14 +261,14 @@ class Grouper:
         return res


-def make_table(result_dict, column="results"):
+def make_table(result_dict, column: str = "results"):
     """Generate table of results."""
     from pytablewriter import MarkdownTableWriter, LatexTableWriter

     if column == "results":
-        column_name = "Task"
-    elif column == "aggregate":
-        column_name = "Benchmark"
+        column_name = "Tasks"
+    elif column == "groups":
+        column_name = "Groups"

     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()

@@ -393,7 +391,6 @@ def get_git_commit_hash():
 def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)

@@ -428,7 +425,6 @@ def load_yaml_config(yaml_path):
     include_path.reverse()
     final_yaml_config = {}
     for path in include_path:
         # Assumes that path is a full path.
         # If not found, assume the included yaml
         # is in the same dir as the original yaml

@@ -447,7 +443,7 @@ def load_yaml_config(yaml_path):
     return yaml_config


-def regex_replace(string, pattern, repl, count=0):
+def regex_replace(string, pattern, repl, count: int = 0):
     """Implements the `re.sub` function as a custom Jinja filter."""
     return re.sub(pattern, repl, string, count=count)

@@ -521,7 +517,7 @@ def pad_and_concat(
     return torch.cat(tensors, dim=0)


-def clear_torch_cache():
+def clear_torch_cache() -> None:
     gc.collect()
     torch.cuda.empty_cache()

@@ -546,7 +542,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
         tokenizer: transformers.PreTrainedTokenizer,
         initial_decoder_input_length: int,
         batch_size: int,
-    ):
+    ) -> None:
         self.initial_decoder_input_length = initial_decoder_input_length
         self.done_tracker = [False] * batch_size
         self.sequence = sequence
main.py

@@ -9,23 +9,26 @@ from pathlib import Path
 from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
-from lm_eval.benchmarks import include_benchmarks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"


-def parse_args():
-    parser = argparse.ArgumentParser()
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
     parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
+    parser.add_argument(
+        "--tasks",
+        default=None,
+        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
+    )
     parser.add_argument(
         "--model_args",
         default="",
         help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
     )
-    parser.add_argument(
-        "--tasks", default=None
-        # , choices=utils.MultiChoice(sorted(ALL_TASKS))
-    )
     parser.add_argument(
         "--num_fewshot",
         type=int,

@@ -98,7 +101,7 @@ def parse_args():
     return parser.parse_args()


-def main():
+def main() -> None:
     args = parse_args()

     if args.limit:

@@ -125,10 +128,21 @@ def main():
     else:
         tasks_list = args.tasks.split(",")
         task_names = utils.pattern_match(tasks_list, ALL_TASKS)
+        task_missing = []
         for task in [task for task in tasks_list if task not in task_names]:
             if os.path.isfile(task):
                 config = utils.load_yaml_config(task)
                 task_names.append(config)
+            else:
+                task_missing.append(task)
+
+        if task_missing != []:
+            missing = ", ".join(task_missing)
+            eval_logger.error(
+                f"Tasks were not found: {missing}\n"
+                f"{SPACING}Try `lm-eval -h` for list of available tasks",
+            )
+            raise ValueError(f"Tasks {missing} were not found.")

     if args.output_path:
         path = Path(args.output_path)

@@ -195,8 +209,8 @@ def main():
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(evaluator.make_table(results))
-       if "aggregate" in results:
-           print(evaluator.make_table(results, "aggregate"))
+       if "groups" in results:
+           print(evaluator.make_table(results, "groups"))


 if __name__ == "__main__":
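With the new task_missing bookkeeping, a task name that neither matches the registry nor points at a YAML file on disk now fails loudly instead of being silently dropped. A condensed, standalone sketch of that flow; the registry contents are illustrative, and pattern_match is re-implemented inline to mirror the fnmatch-based helper from lm_eval/utils.py:

    import fnmatch
    import os

    ALL_TASKS = ["arc_challenge", "arc_easy", "hellaswag", "wsc273"]  # illustrative registry


    def pattern_match(patterns, source_list):
        # Keep every registry entry matching at least one (possibly wildcarded) pattern.
        matched = set()
        for pattern in patterns:
            matched.update(fnmatch.filter(source_list, pattern))
        return sorted(matched)


    tasks_list = ["arc_easy", "wsc273", "squadv3"]      # "squadv3" is a typo
    task_names = pattern_match(tasks_list, ALL_TASKS)   # ['arc_easy', 'wsc273']
    task_missing = [
        t for t in tasks_list
        if t not in task_names and not os.path.isfile(t)  # a real path would be loaded as a YAML config instead
    ]

    if task_missing:
        missing = ", ".join(task_missing)
        print(f"Tasks were not found: {missing}")
        # main() logs this via eval_logger.error and then raises
        # ValueError(f"Tasks {missing} were not found.")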
mypy.ini  (new file)

[mypy]
python_version = 3.9
show_traceback = True
check_untyped_defs = True
no_implicit_reexport = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
warn_redundant_casts = True

# We ignore errors everywhere to gradually add type annotations
[mypy-lm_eval.*]
ignore_errors = True

[mypy-lm_eval.api.*]
ignore_errors = True

[mypy-lm_eval.prompts.*]
ignore_errors = True

[mypy-lm_eval.models.*]
ignore_errors = True

[mypy-scripts.*]
ignore_errors = True

[mypy-main]
ignore_errors = True
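The strict flags under [mypy] apply project-wide, while the per-package ignore_errors sections temporarily silence each package; that is what lets annotations be added gradually, as the comment says. Running mypy from the repository root should pick this file up automatically, since mypy reads a mypy.ini in the current directory by default.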
pyproject.toml  (new file)

[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "lm_eval"
version = "1.0.0"
authors = [
    {name="EleutherAI", email="contact@eleuther.ai"}
]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
requires-python = ">=3.9"
license = { "text" = "MIT" }
dependencies = [
    "accelerate>=0.21.0",
    "evaluate",
    "datasets>=2.0.0",
    "evaluate>=0.4.0",
    "jsonlines",
    "numexpr",
    "peft>=0.2.0",
    "pybind11>=2.6.2",
    "pytablewriter",
    "rouge-score>=0.0.4",
    "sacrebleu>=1.5.0",
    "scikit-learn>=0.24.1",
    "sqlitedict",
    "torch>=1.8",
    "tqdm-multiprocess",
    "transformers>=4.1",
    "zstandard",
]

[tool.setuptools]
packages = ["lm_eval"]

# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]

[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"

[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"

[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = [
    "flake8",
    "pylint",
    "mypy",
    "pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
    "lm_eval[dev]",
    "lm_eval[testing]",
    "lm_eval[linting]",
    "lm_eval[multilingual]",
    "lm_eval[sentencepiece]",
    "lm_eval[promptsource]",
    "lm_eval[gptq]",
    "lm_eval[anthropic]",
    "lm_eval[openai]",
]
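With this metadata in place, a development setup would typically be an editable install with whichever extras are needed, e.g. pip install -e ".[dev]", and the [project.scripts] table exposes the same main() entry point under both the lm-eval and lm_eval console commands.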
scripts/write_out.py

@@ -38,13 +38,15 @@ def main():
     iters = []

     for set in args.sets.split(","):
+        docs = None
         if set == "train" and task.has_training_docs():
             docs = task.training_docs()
         if set == "val" and task.has_validation_docs():
             docs = task.validation_docs()
         if set == "test" and task.has_test_docs():
             docs = task.test_docs()
-        iters.append(docs)
+        if docs is not None:
+            iters.append(docs)

     docs = join_iters(iters)
setup.py

 import setuptools
-import itertools

-with open("README.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
+# This is to make sure that the package supports editable installs
+setuptools.setup()

-extras_require = {
-    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-    "linting": [
-        "flake8",
-        "pylint",
-        "mypy",
-        "pre-commit",
-    ],
-    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
-    "promptsource": [
-        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-    ],
-    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-    "anthropic": ["anthropic"],
-    "openai": ["openai", "tiktoken"],
-}
-extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
-
-setuptools.setup(
-    name="lm_eval",
-    version="1.0.0",
-    author="EleutherAI",
-    author_email="contact@eleuther.ai",
-    description="A framework for evaluating language models",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/EleutherAI/lm-evaluation-harness",
-    packages=setuptools.find_packages(),
-    # required to include yaml files in pip installation
-    package_data={
-        "lm_eval": ["**/*.yaml", "tasks/**/*"],
-        "examples": ["**/*.yaml"],
-    },
-    entry_points={
-        "console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
-    },
-    include_package_data=True,
-    classifiers=[
-        "Development Status :: 3 - Alpha",
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.9",
-    install_requires=[
-        "accelerate>=0.18.0",
-        "evaluate",
-        "datasets>=2.0.0",
-        "evaluate>=0.4.0",
-        "jsonlines",
-        "numexpr",
-        "omegaconf>=2.2",
-        "peft>=0.2.0",
-        "pybind11>=2.6.2",
-        "pycountry",
-        "pytablewriter",
-        "rouge-score>=0.0.4",
-        "sacrebleu==1.5.0",
-        "scikit-learn>=0.24.1",
-        "sqlitedict",
-        "torch>=1.8",
-        "tqdm-multiprocess",
-        "transformers>=4.1",
-        "zstandard",
-    ],
-    extras_require=extras_require,
-)
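Everything removed here reappears in the new pyproject.toml above, with a few pins updated (e.g. accelerate>=0.21.0, sacrebleu>=1.5.0) and omegaconf dropped from the core dependencies, matching its removal from lm_eval/utils.py. The bare setuptools.setup() call that remains is a shim so editable installs keep working, as the in-file comment notes.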
tests/extra/test_new_tasks.py  (deleted)

import pytest
from itertools import islice
import lm_eval.tasks as tasks
from .utilities_testing import load_changed_files, parser
from typing import List
from lm_eval.api.task import ConfigurableTask
import os


# GitHub CI
def new_tasks() -> List[str]:
    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
    if os.path.exists(FILENAME):
        # If tasks folder has changed then we get the list of files from FILENAME
        # and parse the yaml files to get the task names.
        return parser(load_changed_files(FILENAME))
    elif os.getenv("API") is not None:
        # Or if API has changed then we set the ENV variable API to True
        # and run given tasks.
        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
    # if both not true just do arc_easy
    else:
        return ["arc_easy"]


def get_task_class() -> List[ConfigurableTask]:
    task_name = new_tasks()
    x = [cls for name, cls in tasks.TASK_REGISTRY.items() if name in task_name]
    return x


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", get_task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class().download()
        assert task_class().dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class().has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class()
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class().has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class()
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class().has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class()
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class()
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
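This suite does not simply disappear: the reworked tests/test_tasks.py below carries an equivalent parametrized TestNewTasks class, driven by the new_tasks() helper that moves into the renamed tests/utils.py.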
tests/extra/test_utils.py  (deleted)

import json
from typing import List
from lm_eval.utils import load_yaml_config
from pathlib import Path

FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"


def load_changed_files(file_path: str = FILE_PATH) -> List[str]:
    with open(file_path, "r") as f:
        return [l for line in f.readlines() for l in line.strip().split(" ")]


def parser(full_path: List[str]) -> List[str]:
    _output = set()
    for x in full_path:
        if x.endswith(".yaml"):
            _output.add(load_yaml_config(x)["task"])
        elif x.endswith(".py"):
            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
            _output |= {load_yaml_config(x)["task"] for x in path}
    return list(_output)
tests/models/test_huggingface.py

 from __future__ import annotations

 import pytest
+from pathlib import Path

 import numpy as np

 from lm_eval.models.huggingface import HFLM
 from lm_eval.api.instance import Instance
 import lm_eval.tasks as tasks
+import sys
+import torch


 class Test_HFLM:
+    torch.use_deterministic_algorithms(True)
+    version_minor = sys.version_info.minor
     multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances

@@ -90,8 +94,15 @@ class Test_HFLM:
     def test_logliklihood(self) -> None:
         res = self.LM.loglikelihood(self.MULTIPLE_CH)
         _RES, _res = self.MULTIPLE_CH_RES, [r[0] for r in res]
-        # change atol in case of consistent failure
-        assert np.allclose(_res, _RES, atol=1e-4)
+        # log samples to CI
+        dir_path = Path("test_logs")
+        dir_path.mkdir(parents=True, exist_ok=True)
+
+        file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
+        file_path = file_path.resolve()
+        with open(file_path, "w") as f:
+            f.write("\n".join(str(x) for x in _res))
+        assert np.allclose(_res, _RES, atol=1e-2)
         # check indices for Multiple Choice
         argmax_RES, argmax_res = np.argmax(
             np.array(_RES).reshape(-1, 4), axis=1
tests/test_evaluator.py

@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List

 import random
 import pytest

@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
tests/test_tasks.py

The file is reorganized from standalone, fixture-based test functions (the old task_class() fixture returned a single hard-coded arc_easy task class, which each test instantiated itself) into the parametrized TestNewTasks class that previously lived in tests/extra/test_new_tasks.py, now driven by new_tasks() from the renamed tests/utils.py. After the change (-110 +105) the file reads:

from itertools import islice

import pytest
from typing import List

from .utils import new_tasks

import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask


# Default Task
TASKS = ["arc_easy"]


def task_class():
    global TASKS
    # CI: new_tasks checks if any modifications have been made
    task_classes = new_tasks()
    # Check if task_classes is empty
    if task_classes:
        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
    else:
        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]


@pytest.fixture()
def limit() -> int:
    return 10


# Tests
@pytest.mark.parametrize("task_class", task_class())
class TestNewTasks:
    def test_download(self, task_class: ConfigurableTask):
        task_class.download()
        assert task_class.dataset is not None

    def test_has_training_docs(self, task_class: ConfigurableTask):
        assert task_class.has_training_docs() in [True, False]

    def test_check_training_docs(self, task_class: ConfigurableTask):
        task = task_class
        if task.has_training_docs():
            assert task._config["training_split"] is not None

    def test_has_validation_docs(self, task_class):
        assert task_class.has_validation_docs() in [True, False]

    def test_check_validation_docs(self, task_class):
        task = task_class
        if task.has_validation_docs():
            assert task._config["validation_split"] is not None

    def test_has_test_docs(self, task_class):
        assert task_class.has_test_docs() in [True, False]

    def test_check_test_docs(self, task_class):
        task = task_class
        if task.has_test_docs():
            assert task._config["test_split"] is not None

    def test_should_decontaminate(self, task_class):
        task = task_class
        assert task.should_decontaminate() in [True, False]
        if task.should_decontaminate():
            assert task._config["doc_to_decontamination_query"] is not None

    def test_doc_to_text(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array = [task.doc_to_text(doc) for doc in arr]
        # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
        assert all(
            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
            for x in _array
        )

    def test_create_choices(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
            assert all(isinstance(x[0], str) for x in _array)

    def test_doc_to_target(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
        if task._config.output_type == "multiple_choice":
            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))

    def test_build_all_requests(self, task_class, limit):
        task_class.build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

    # ToDO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class
        arr = (
            list(islice(task.test_docs(), limit))
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True

    # def test_create_choices(task_class):
tests/extra/utilities_testing.py → tests/utils.py  (renamed)

-import json
 from typing import List
 from lm_eval.utils import load_yaml_config
 from pathlib import Path
-import sys
+from typing import Union
+import os

+# {{{CI}}}
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"

@@ -14,7 +16,6 @@ def load_changed_files(file_path: str) -> List[str]:
     with open(file_path, "r") as f:
         content = f.read()
         words_list = [x for x in content.split()]
-        sys.stdout.write(f"list of files: {words_list}")
         return words_list

@@ -30,3 +31,18 @@ def parser(full_path: List[str]) -> List[str]:
         path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
         _output |= {load_yaml_config(x)["task"] for x in path}
     return list(_output)
+
+
+def new_tasks() -> Union[List[str], None]:
+    FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+    if os.path.exists(FILENAME):
+        # If tasks folder has changed then we get the list of files from FILENAME
+        # and parse the yaml files to get the task names.
+        return parser(load_changed_files(FILENAME))
+    elif os.getenv("API") is not None:
+        # Or if API has changed then we set the ENV variable API to True
+        # and run given tasks.
+        return ["arc_easy", "hellaswag", "piqa", "wikitext"]
+    # if both not true just do arc_easy
+    else:
+        return
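new_tasks() is what lets CI size the test matrix to the actual changes: an edited task YAML contributes its own task name, an edited .py file contributes every task YAML sitting next to it, the API environment flag falls back to a fixed battery, and otherwise the default list is used. A small runnable sketch of the YAML/.py dispatch in parser(), with load_yaml_config replaced by a stub so it runs without the real task files on disk:

    from pathlib import Path
    from typing import List, Set


    def fake_load_yaml_config(path: str) -> dict:
        # Stub for lm_eval.utils.load_yaml_config: pretend each config's "task"
        # is named after the folder it lives in.
        return {"task": Path(path).parent.name}


    def resolve_tasks(changed_files: List[str]) -> List[str]:
        # Same dispatch as parser(): YAML files map to their task name directly,
        # Python files pull in every sibling *.yaml config.
        tasks: Set[str] = set()
        for f in changed_files:
            if f.endswith(".yaml"):
                tasks.add(fake_load_yaml_config(f)["task"])
            elif f.endswith(".py"):
                tasks |= {fake_load_yaml_config(str(y))["task"] for y in Path(f).parent.glob("*.yaml")}
        return sorted(tasks)


    print(resolve_tasks(["lm_eval/tasks/wsc273/default.yaml", "lm_eval/tasks/wsc273/utils.py"]))
    # -> ['wsc273'] in this toy setup (the .py branch only finds sibling YAMLs that exist on disk)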