gaoqiong / lm-evaluation-harness
Commit cb8889cc — authored Feb 05, 2024 by lintangsutawika
merged with latest update from main
Parents: ec05e561, 74119471
Showing 9 changed files with 124 additions and 41 deletions (+124 / -41).
Changed files:
  scripts/regression.py             +5   -1
  scripts/write_out.py              +7   -8
  scripts/zeno_visualize.py         +12  -8
  tests/models/test_huggingface.py  +5   -4
  tests/models/test_openvino.py     +73  -0
  tests/models/test_vllm.py         +7   -4
  tests/test_evaluator.py           +3   -4
  tests/test_tasks.py               +3   -3
  tests/utils.py                    +9   -9
scripts/regression.py

@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
         ret = os.system(command)
-        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+        results[model] = (
+            json.load(open(output_path, encoding="utf-8"))
+            if ret == 0
+            else {"results": {}}
+        )
         end_time = time.time()
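The only functional change here is the explicit encoding="utf-8" on open(); without it, Python falls back to the platform's locale encoding, which is not UTF-8 on some systems and can fail on results files containing non-ASCII text. A minimal, self-contained sketch of the behaviour being guarded against (the file name and payload are illustrative, not from the repository):

import json
import tempfile
from pathlib import Path

# Illustrative only: write a results-style JSON file containing non-ASCII text.
payload = {"results": {"note": "café ✓"}}

with tempfile.TemporaryDirectory() as tmp:
    output_path = Path(tmp, "results.json")
    output_path.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")

    # Explicit encoding: decodes correctly regardless of the platform locale.
    with open(output_path, encoding="utf-8") as f:
        assert json.load(f) == payload

    # Relying on the default (locale) encoding instead can raise UnicodeDecodeError
    # on systems whose locale is not UTF-8 (e.g. cp1252 on some Windows setups).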
scripts/write_out.py

@@ -5,7 +5,7 @@ import random
 import numpy as np
 from lm_eval import tasks
-from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import eval_logger, join_iters

@@ -39,22 +39,21 @@ def main():
     args = parse_args()
     np.random.seed(args.seed)
-    initialize_tasks(args.verbosity)
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
     if args.tasks == "all_tasks":
-        task_names = tasks.ALL_TASKS
+        task_names = task_manager.all_tasks
     else:
         task_names = args.tasks.split(",")
-    task_dict = tasks.get_task_dict(task_names)
+    task_dict = tasks.get_task_dict(task_names, task_manager)
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
-        if type(task) == tuple:
-            group_name, task = task
+        if isinstance(task, tuple):
+            _, task = task
         rnd = random.Random()
         rnd.seed(args.seed)
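The recurring pattern in this commit replaces module-level task initialization (initialize_tasks, include_path, tasks.ALL_TASKS, TASK_REGISTRY) with a TaskManager instance that owns task discovery. A minimal sketch of the new call sequence, based only on the API as it appears in this diff; the verbosity value "INFO" and the include_path of None are assumptions standing in for the script's CLI arguments:

from lm_eval import tasks
from lm_eval.tasks import TaskManager

# Build one manager; include_path optionally points at a directory of extra task YAMLs.
task_manager = TaskManager("INFO", include_path=None)

# Enumerate everything the manager knows about, or resolve specific tasks.
all_names = task_manager.all_tasks
task_dict = tasks.get_task_dict(["arc_easy"], task_manager)

for name, task in task_dict.items():
    # Group entries may come back as (group_name, task) tuples; unwrap them.
    if isinstance(task, tuple):
        _, task = task
    print(name, type(task).__name__)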
scripts/zeno_visualize.py

@@ -69,18 +69,20 @@ def main():
         model_args = re.sub(
             "/|=",
             "__",
-            json.load(open(Path(args.data_path, model, "results.json")))["config"]["model_args"],
+            json.load(
+                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+            )["config"]["model_args"],
         )
         with open(
-            Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
+            Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+            "r",
+            encoding="utf-8",
         ) as file:
             data = json.loads(file.read())
-        configs = json.load(open(Path(args.data_path, model, "results.json")))["configs"]
+        configs = json.load(
+            open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+        )["configs"]
         config = configs[task]
         if model_index == 0:  # Only need to assemble data for the first model

@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
         list: A list of tasks for the model.
     """
     dir_path = Path(data_path, model)
-    config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+    config = (
+        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    )
     return list(config[0].keys())
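After this change the same json.load(open(..., encoding="utf-8")) expression appears three times in this file (plus once in tasks_for_model), and the bare open() inside json.load() is never explicitly closed. Not part of this commit, but a small helper along these lines could consolidate the pattern; the function name and signature are purely illustrative:

import json
from pathlib import Path
from typing import Any


def load_json(*path_parts: str) -> Any:
    # Read a JSON file with an explicit UTF-8 encoding, closing the handle promptly.
    with open(Path(*path_parts), encoding="utf-8") as f:
        return json.load(f)


# Hypothetical usage mirroring the calls above:
# model_args = load_json(args.data_path, model, "results.json")["config"]["model_args"]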
tests/models/test_huggingface.py

@@ -11,20 +11,21 @@ from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM

-tasks.initialize_tasks()
+task_manager = tasks.TaskManager()


 class Test_HFLM:
     torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
     version_minor = sys.version_info.minor
-    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
+    generate_until_task = task_list["gsm8k"]  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: list[Instance] = generate_until_task.instances
-    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
+    rolling_task = task_list["wikitext"]  # type: ignore
     rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
     ROLLING: list[Instance] = rolling_task.instances
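As used in these tests, task_manager.load_task_or_group([...]) evidently returns a mapping from task name to an instantiated task object, replacing the per-task tasks.TASK_REGISTRY.get(name)() calls. A condensed sketch of that fixture pattern, assuming lm_eval and the referenced datasets are available locally (running it would trigger dataset downloads); the task names and limits are copied from the diff:

import lm_eval.tasks as tasks

task_manager = tasks.TaskManager()

# One call materializes several tasks; index the result by task name.
task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])

for name, task in task_list.items():
    # Each task then builds its evaluation requests exactly as before.
    task.build_all_requests(limit=10, rank=0, world_size=1)
    print(name, len(task.instances))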
tests/models/test_openvino.py  (new file, mode 100644)

import random
import tempfile

import pytest
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model

SUPPORTED_ARCHITECTURES_TASKS = {
    "facebook/opt-125m": "lambada_openai",
    "hf-internal-testing/tiny-random-gpt2": "wikitext",
}


@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)
        model.save_pretrained(tmpdirname)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdirname)

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={tmpdirname}",
            {
                "batch_size": 1,
                "device": "cpu",
            },
        )

        def ll_fn(reqs):
            for ctx, cont in [req.args for req in reqs]:
                if len(ctx) == 0:
                    continue
                # space convention
                assert ctx[-1] != " "
                assert cont[0] == " " or ctx[-1] == "\n"

            res = []
            random.seed(42)
            for _ in reqs:
                res.append((-random.random(), False))

            return res

        def ll_perp_fn(reqs):
            for (string,) in [req.args for req in reqs]:
                assert isinstance(string, str)

            res = []
            random.seed(42)
            for _ in reqs:
                res.append(-random.random())

            return res

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        limit = 10
        evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
        )
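Note that this new test replaces lm.loglikelihood and lm.loglikelihood_rolling with seeded random stubs, so it exercises the OpenVINO export/loading path and the evaluator plumbing rather than model quality. Assuming optimum-intel (with OpenVINO support), transformers, and pytest are installed, it could presumably be run in isolation with something like:

pytest tests/models/test_openvino.py -q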
tests/models/test_vllm.py

@@ -7,6 +7,9 @@ import lm_eval.tasks as tasks
 from lm_eval.api.instance import Instance

+task_manager = tasks.TaskManager()


 @pytest.mark.skip(reason="requires CUDA")
 class TEST_VLLM:
     vllm = pytest.importorskip("vllm")

@@ -17,15 +20,15 @@ class TEST_VLLM:
     except ModuleNotFoundError:
         pass
     torch.use_deterministic_algorithms(True)
-    tasks.initialize_tasks()
-    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
+    generate_until_task = task_list["gsm8k"]  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: List[Instance] = generate_until_task.instances
-    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
+    rolling_task = task_list["wikitext"]  # type: ignore
     rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
     ROLLING: List[Instance] = rolling_task.instances
tests/test_evaluator.py

@@ -6,11 +6,9 @@ import pytest
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
-import lm_eval.tasks as tasks
+from lm_eval import tasks

-tasks.initialize_tasks()

 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces

@@ -46,7 +44,8 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
             "device": None,
         },
     )
-    task_dict = tasks.get_task_dict(task_name, num_fewshot=0)
+    task_manager = tasks.TaskManager()
+    task_dict = tasks.get_task_dict(task_name, task_manager)

     e2 = evaluator.evaluate(
         lm=lm,
tests/test_tasks.py

@@ -8,7 +8,7 @@ from lm_eval.api.task import ConfigurableTask
 from .utils import new_tasks

-tasks.initialize_tasks()
+task_manager = tasks.TaskManager()

 # Default Task
 TASKS = ["arc_easy"]

@@ -19,9 +19,9 @@ def task_class():
     task_classes = new_tasks()
     # Check if task_classes is empty
     if task_classes:
-        return [tasks.TASK_REGISTRY.get(x)() for x in task_classes]
+        return list(task_manager.load_task_or_group(task_classes).values())
     else:
-        return [tasks.TASK_REGISTRY.get(x)() for x in TASKS]
+        return list(task_manager.load_task_or_group(TASKS).values())


 @pytest.fixture()
tests/utils.py

 import os
 from pathlib import Path
 from typing import List, Union

 from lm_eval.utils import load_yaml_config

@@ -20,17 +19,18 @@ def load_changed_files(file_path: str) -> List[str]:
 # checks the txt file for list of changed files.
-# if file ends with .yaml then check yaml for task name
-# if file ends with .py then parse the folder for all yaml files
-# skips benchmarks folder
+# if file ends with .yaml then check yaml and load the config.
+# if the config task is a string, it's a task config.
+# if the config task is a list, it's a group config.
 def parser(full_path: List[str]) -> List[str]:
     _output = set()
     for x in full_path:
-        if x.endswith(".yaml") and "benchmarks" not in x:
-            _output.add(load_yaml_config(x)["task"])
-        elif x.endswith(".py") and "benchmarks" not in x:
-            path = [str(x) for x in (list(Path(x).parent.glob("*.yaml")))]
-            _output |= {load_yaml_config(x)["task"] for x in path}
+        if os.path.exists(x) and x.endswith(".yaml"):
+            config = load_yaml_config(x, mode="simple")
+            if isinstance(config["task"], str):
+                _output.add(config["task"])
+            elif isinstance(config["task"], list):
+                _output.add(config["group"])
     return list(_output)
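With this change, parser() inspects each changed YAML config directly: a string "task" field marks a single-task config and contributes that task name, while a list "task" field marks a group config and contributes the "group" name instead; the old .py handling and benchmarks-folder special case are dropped. A self-contained sketch of that selection logic, using PyYAML directly instead of lm_eval's load_yaml_config and with invented file contents for illustration:

import tempfile
from pathlib import Path

import yaml  # assumed available; stands in for lm_eval's YAML loading

configs = {
    "single.yaml": {"task": "arc_easy"},                      # task config  -> "arc_easy"
    "group.yaml": {"group": "my_suite", "task": ["a", "b"]},  # group config -> "my_suite"
}

with tempfile.TemporaryDirectory() as tmp:
    names = set()
    for fname, content in configs.items():
        path = Path(tmp, fname)
        path.write_text(yaml.safe_dump(content), encoding="utf-8")
        config = yaml.safe_load(path.read_text(encoding="utf-8"))
        # Mirror the new branch structure: string -> task name, list -> group name.
        if isinstance(config["task"], str):
            names.add(config["task"])
        elif isinstance(config["task"], list):
            names.add(config["group"])
    print(sorted(names))  # ['arc_easy', 'my_suite']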