gaoqiong / lm-evaluation-harness

Commit 60c9c170, authored May 29, 2024 by haileyschoelkopf

Merge branch 'main' into inverse-scaling-tasks

Parents: 4b2d565b, b4cd85d4
Showing 5 changed files with 92 additions and 18 deletions (this page covers 5 of the 605 files changed in the merge):
tests/models/test_openvino.py (+19, -0)
tests/models/test_vllm.py (+1, -1)
tests/test_cli.py (+43, -0)
tests/test_evaluator.py (+19, -7)
tests/test_requests_caching.py (+19, -7)
tests/models/test_openvino.py

import random
import tempfile
from pathlib import Path

import pytest
from optimum.intel import OVModelForCausalLM

...

@@ -71,3 +72,21 @@ def test_evaluator(model_id, task):
         limit=limit,
         bootstrap_iters=10,
     )
+
+
+def test_ov_config():
+    """Test that if specified, a custom OpenVINO config is loaded correctly"""
+    model_id = "hf-internal-testing/tiny-random-gpt2"
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        config_file = str(Path(tmpdirname) / "ov_config.json")
+        with open(Path(config_file), "w") as f:
+            f.write('{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}')
+        lm = get_model("openvino").create_from_arg_string(
+            f"pretrained={model_id},ov_config={config_file}"
+        )
+        assert (
+            lm.model.request.get_compiled_model().get_property(
+                "DYNAMIC_QUANTIZATION_GROUP_SIZE"
+            )
+            == 32
+        )
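For context, the ov_config JSON file the test writes to disk is ultimately forwarded to optimum.intel's model loader. A minimal standalone sketch of the same effect, not part of this commit, assuming `optimum-intel` is installed with its OpenVINO extras:

# Hedged sketch (not from the commit): load the same tiny model directly
# through optimum.intel with a custom OpenVINO config dict, which is roughly
# what the harness's "openvino" backend does with the ov_config file above.
from optimum.intel import OVModelForCausalLM

model = OVModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-gpt2",
    export=True,  # convert the PyTorch checkpoint to OpenVINO IR on the fly
    ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "32"},
)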
tests/models/test_vllm.py

...

@@ -25,8 +25,8 @@ class TEST_VLLM:
         multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
         MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
         generate_until_task = task_list["gsm8k"]  # type: ignore
-        generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
+        generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
         generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
         generate_until: List[Instance] = generate_until_task.instances
         rolling_task = task_list["wikitext"]  # type: ignore
         rolling_task.build_all_requests(limit=10, rank=0, world_size=1)

...
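The single changed line caps `max_gen_toks` at 10 for the gsm8k generate_until requests, which appears intended to keep the vLLM test fast; the removed line looks like it was a duplicated `build_all_requests` call, since the identical call remains immediately below it.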
tests/test_cli.py (new file, mode 100644)

import argparse

import pytest

import lm_eval.__main__


def test_cli_parse_error():
    """
    Assert error raised if cli args argument doesn't have type
    """
    with pytest.raises(ValueError):
        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
        parser.add_argument(
            "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
        )
        parser.add_argument(
            "--tasks",
            "-t",
            default=None,
            metavar="task1,task2",
            help="To get full list of tasks, use the command lm-eval --tasks list",
        )
        lm_eval.__main__.check_argument_types(parser)


def test_cli_parse_no_error():
    """
    Assert typed arguments are parsed correctly
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
    )
    parser.add_argument(
        "--tasks",
        "-t",
        type=str,
        default=None,
        metavar="task1,task2",
        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
    lm_eval.__main__.check_argument_types(parser)
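Both tests exercise `check_argument_types` from `lm_eval/__main__.py`: an untyped `--tasks` must raise ValueError, while a fully typed parser must pass. A hedged sketch of the behavior the tests imply; the real implementation may differ, and the special-casing of boolean flags below is an assumption:

import argparse


# Hedged sketch of what these tests imply check_argument_types does;
# the actual implementation in lm_eval/__main__.py may differ.
def check_argument_types(parser: argparse.ArgumentParser) -> None:
    """Raise ValueError for any argument registered without an explicit type."""
    for action in parser._actions:
        if action.dest == "help":
            continue  # the built-in -h/--help action carries no type
        if isinstance(action, (argparse._StoreTrueAction, argparse._StoreFalseAction)):
            continue  # store_true/store_false flags legitimately have no type
        if action.type is None:
            raise ValueError(f"Argument '{action.dest}' doesn't have a type specified.")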
tests/test_evaluator.py

...

@@ -14,25 +14,33 @@ from lm_eval import tasks
 @pytest.mark.parametrize(
-    "task_name,limit,model,model_args",
+    "task_name,limit,model,model_args,bootstrap_iters",
     [
         (
             ["arc_easy"],
             10,
             "hf",
             "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+            0,
         ),
+        (
+            ["mmlu_abstract_algebra"],
+            None,
+            "hf",
+            "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
+            10000,
+        ),
     ],
 )
-def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
-    task_name = task_name
-    limit = 10
+def test_evaluator(
+    task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
+):
     e1 = evaluator.simple_evaluate(
         model=model,
         tasks=task_name,
         limit=limit,
         model_args=model_args,
+        bootstrap_iters=bootstrap_iters,
     )
     assert e1 is not None

...

@@ -51,13 +59,17 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
         lm=lm,
         task_dict=task_dict,
         limit=limit,
+        bootstrap_iters=bootstrap_iters,
     )
     assert e2 is not None

     # check that caching is working
     def r(x):
-        return x["results"]["arc_easy"]
+        if "arc_easy" in x["results"]:
+            return x["results"]["arc_easy"]
+        else:
+            return x["results"]["mmlu_abstract_algebra"]

     assert all(
         x == y

...
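The new `bootstrap_iters` parameter is the number of bootstrap resamples used when estimating standard errors for aggregated metrics, and the parametrization exercises both extremes: 0 disables bootstrapping entirely, 10000 stresses it. A hedged usage sketch assembled only from arguments visible in this diff:

from lm_eval import evaluator

# Hedged usage sketch built from the arguments visible in this diff.
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
    tasks=["arc_easy"],
    limit=10,           # evaluate only the first 10 documents per task
    bootstrap_iters=0,  # skip bootstrap stderr estimation entirely
)
assert results is not None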
tests/test_requests_caching.py

...

@@ -20,8 +20,8 @@ sys.path.append(f"{MODULE_DIR}/../scripts")
 model_loader = importlib.import_module("requests_caching")
 run_model_for_task_caching = model_loader.run_model_for_task_caching

-DEFAULT_TASKS = ["lambada_openai", "hellaswag"]
+os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "1"
+
+DEFAULT_TASKS = ["lambada_openai", "sciq"]

 @pytest.fixture(autouse=True)

...

@@ -64,16 +64,16 @@ def assert_created(tasks: List[str], file_task_names: List[str]):
 @pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
-def test_requests_caching_true(tasks: List[str]):
+def requests_caching_true(tasks: List[str]):
     run_model_for_task_caching(tasks=tasks, cache_requests="true")
     cache_files, file_task_names = get_cache_files()
     print(file_task_names)
     assert_created(tasks=tasks, file_task_names=file_task_names)


 @pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
-def test_requests_caching_refresh(tasks: List[str]):
+def requests_caching_refresh(tasks: List[str]):
     run_model_for_task_caching(tasks=tasks, cache_requests="true")
     timestamp_before_test = datetime.now().timestamp()

...

@@ -93,9 +93,9 @@ def test_requests_caching_refresh(tasks: List[str]):
 @pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
-def test_requests_caching_delete(tasks: List[str]):
+def requests_caching_delete(tasks: List[str]):
     # populate the data first, rerun this test within this test for additional confidence
-    test_requests_caching_true(tasks=tasks)
+    # test_requests_caching_true(tasks=tasks)
     run_model_for_task_caching(tasks=tasks, cache_requests="delete")

...

@@ -109,9 +109,9 @@ if __name__ == "__main__":
 def run_tests():
     tests = [
-        test_requests_caching_true,
-        test_requests_caching_refresh,
-        test_requests_caching_delete,
+        # test_requests_caching_true,
+        # test_requests_caching_refresh,
+        # test_requests_caching_delete,
     ]
     for test_func in tests:

...
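Dropping the `test_` prefix from `test_requests_caching_*` removes these functions from pytest's default collection (its `python_functions` pattern is `test_*`), and the commented-out entries in `run_tests()` disable them under direct invocation as well. A minimal illustration of the collection rule, not from the commit:

# Only the first function below is collected and run by a plain `pytest`
# invocation; the second is skipped by the default "test_*" naming rule.
def test_collected():
    assert True


def not_collected():
    raise AssertionError("never runs under plain `pytest`")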