gaoqiong / lm-evaluation-harness · Commits

Commit 90ad5db7
authored Mar 01, 2024 by lintangsutawika

merged main

Parents: f692caa9, b177c82c

Changes: 484
Showing 4 changed files with 273 additions and 4 deletions (+273 -4)
scripts/requests_caching.py       +92  -0
tests/models/test_huggingface.py  +2   -2
tests/test_requests_caching.py    +123 -0
tests/test_utils.py               +56  -2
scripts/requests_caching.py 0 → 100644
"""
Usage:
python requests_caching.py --tasks=comma,separated,list,of,tasks --cache_requests=<true|refresh|delete]>
"""
import argparse
import os
from typing import List

import torch
from transformers import (
    pipeline as trans_pipeline,
)

from lm_eval import simple_evaluate
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.utils import eval_logger

MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# Used to specify alternate cache path, useful if run in a docker container
# NOTE raw datasets will break if you try to transfer the cache from your host to a docker image
LM_HARNESS_CACHE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

MODEL = "EleutherAI/pythia-70m"
TASK = "text-generation"


def run_model_for_task_caching(tasks: List[str], cache_requests: str):
    eval_logger.info(f"Loading HF model: {MODEL}")

    trans_pipe = trans_pipeline(
        task=TASK, model=MODEL, device=DEVICE, trust_remote_code=True
    )

    model = trans_pipe.model
    tokenizer = trans_pipe.tokenizer

    eval_logger.info(
        f"Running simple_evaluate to cache request objects for tasks: {tasks}"
    )

    cache_args = request_caching_arg_to_dict(cache_requests=cache_requests)

    eval_logger.info(
        f"The following operations will be performed on the cache: {cache_requests}"
    )

    eval_data = simple_evaluate(
        model="hf-auto",
        model_args={
            "pretrained": model,
            "tokenizer": tokenizer,
        },
        limit=1,
        device=DEVICE,
        tasks=tasks,
        write_out=True,
        **cache_args,
    )

    return eval_data


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
        metavar="task1,task2",
    )
    parser.add_argument(
        "--cache_requests",
        type=str,
        default=None,
        choices=["true", "refresh", "delete"],
        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
    )
    args = parser.parse_args()

    tasks = args.tasks.split(",")
    # MODEL and DEVICE are module-level constants used inside the helper,
    # which only accepts tasks and cache_requests.
    eval_data = run_model_for_task_caching(
        tasks=tasks, cache_requests=args.cache_requests
    )
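Not part of the commit: a minimal sketch of how the helper above could be driven from Python instead of the CLI. The task names are just the defaults used by the new test below, and the sys.path tweak is an assumption that mirrors the one in tests/test_requests_caching.py so the scripts directory is importable.

# Sketch only: warm the request cache programmatically via run_model_for_task_caching.
import importlib
import os
import sys

MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
sys.path.append(f"{MODULE_DIR}/../scripts")  # assumed layout: this file sits next to scripts/

requests_caching = importlib.import_module("requests_caching")

# cache_requests accepts the same choices as the CLI flag: "true", "refresh", "delete".
eval_data = requests_caching.run_model_for_task_caching(
    tasks=["lambada_openai", "hellaswag"],  # example tasks, same as the test defaults
    cache_requests="true",
)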
tests/models/test_huggingface.py

@@ -22,8 +22,8 @@ class Test_HFLM:
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: list[Instance] = generate_until_task.instances
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)

@@ -74,7 +74,7 @@ class Test_HFLM:
    generate_until_RES = [
        " The average of $2.50 each is $",
        " A robe takes 2 bolts of blue fiber and half",
        " $50,000 in repairs.",
        " $50,000 in repairs.\n\nQuestion",
        " He runs 1 sprint 3 times a week.",
        " They feed each of her chickens three cups of mixed",
        " The price of the glasses is $5, but",
tests/test_requests_caching.py 0 → 100644

# import lm_eval.base as base
import importlib
import os
import sys
from datetime import datetime
from typing import List, Tuple

import pytest
import torch

# import lm_eval.models as models
from lm_eval.caching.cache import PATH

MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# NOTE the script this loads uses simple evaluate
# TODO potentially test both the helper script and the normal script
sys.path.append(f"{MODULE_DIR}/../scripts")
model_loader = importlib.import_module("requests_caching")
run_model_for_task_caching = model_loader.run_model_for_task_caching

DEFAULT_TASKS = ["lambada_openai", "hellaswag"]


@pytest.fixture(autouse=True)
def setup_and_teardown():
    # Setup
    torch.use_deterministic_algorithms(False)
    clear_cache()

    # Yields control back to the test function
    yield

    # Cleanup here


def clear_cache():
    if os.path.exists(PATH):
        cache_files = os.listdir(PATH)
        for file in cache_files:
            file_path = f"{PATH}/{file}"
            os.unlink(file_path)


# leaving tasks here to allow for the option to select specific task files
def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
    cache_files = os.listdir(PATH)

    file_task_names = []
    for file in cache_files:
        file_without_prefix = file.split("-")[1]
        file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
        file_task_names.append(file_without_prefix_and_suffix)

    return cache_files, file_task_names


def assert_created(tasks: List[str], file_task_names: List[str]):
    tasks.sort()
    file_task_names.sort()

    assert tasks == file_task_names


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_true(tasks: List[str]):
    run_model_for_task_caching(tasks=tasks, cache_requests="true")

    cache_files, file_task_names = get_cache_files()

    assert_created(tasks=tasks, file_task_names=file_task_names)


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_refresh(tasks: List[str]):
    run_model_for_task_caching(tasks=tasks, cache_requests="true")

    timestamp_before_test = datetime.now().timestamp()

    run_model_for_task_caching(tasks=tasks, cache_requests="refresh")

    cache_files, file_task_names = get_cache_files()

    for file in cache_files:
        modification_time = os.path.getmtime(f"{PATH}/{file}")
        assert modification_time > timestamp_before_test

    tasks.sort()
    file_task_names.sort()

    assert tasks == file_task_names


@pytest.mark.parametrize("tasks", [DEFAULT_TASKS])
def test_requests_caching_delete(tasks: List[str]):
    # populate the data first, rerun this test within this test for additional confidence
    test_requests_caching_true(tasks=tasks)

    run_model_for_task_caching(tasks=tasks, cache_requests="delete")

    cache_files, file_task_names = get_cache_files()

    assert len(cache_files) == 0


# useful for locally running tests through the debugger
if __name__ == "__main__":

    def run_tests():
        tests = [
            test_requests_caching_true,
            test_requests_caching_refresh,
            test_requests_caching_delete,
        ]

        for test_func in tests:
            clear_cache()
            test_func(tasks=DEFAULT_TASKS)

        print("Tests pass")

    run_tests()
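A small aside (my reading, not something the commit documents): get_cache_files above assumes cache file names of the form <prefix>-<task_name>.<extension>, since it keeps whatever sits between the first "-" and the first ".". A standalone illustration with a hypothetical file name:

# Hypothetical cache file name, used purely to illustrate the parsing in get_cache_files.
example_cache_file = "requests-hellaswag.pickle"  # assumed naming pattern

file_without_prefix = example_cache_file.split("-")[1]              # "hellaswag.pickle"
file_without_prefix_and_suffix = file_without_prefix.split(".")[0]  # "hellaswag"

assert file_without_prefix_and_suffix == "hellaswag"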
tests/test_utils.py

@@ -2,6 +2,7 @@ import itertools
import numpy as np
import pytest
import torch

from lm_eval.api.metrics import (
    aggregate_subtask_metrics,

@@ -258,12 +259,20 @@ class TestCollator:
        ]
        return samples

    def make_loglikelihood_sample_group(self, end=11):
        a = [(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x]) for x in range(9)]
        b = [
            (("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x, y, z])
            for x, y, z in zip(range(9), range(9, 18), range(18, 27))
        ]
        return a + b

    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
    def test_generations(self, batch_size, end):
        _collate_gen = lambda x: (-len(x[0]), x[0])  # noqa: E731
        generation_samples = self.make_generate_sample(int(end))
        gens = Collator(generation_samples, _collate_gen, grouping=True)
        gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for chunks in chunks:

@@ -292,7 +301,10 @@ class TestCollator:
    def test_loglikelihood(self, batch_size, end):
        _collate_log = lambda x: (-len(x[1]), tuple(x[1]))  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample(int(end))
        loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
        loglikelihoods = Collator(
            loglikelihood_samples,
            _collate_log,
        )
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        for chunks in chunks:

@@ -309,6 +321,48 @@ class TestCollator:
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]

    @pytest.mark.parametrize("batch_size", [17, 8, 12, 0])
    def test_context_grouping(self, batch_size):
        def _collate(x):
            toks = x[1] + x[2]
            return -len(toks), tuple(toks)

        _collate_log = _collate  # noqa: E731
        loglikelihood_samples = self.make_loglikelihood_sample_group()
        loglikelihoods = Collator(
            loglikelihood_samples,
            _collate_log,
            group_fn=lambda a: a[-2] + a[-1][:-1],
            group_by="contexts",
        )
        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
        output = []
        outputs_ = []
        for chunks in chunks:
            # check batching
            if batch_size != 0:
                assert len(chunks) <= batch_size
            # check reorder
            assert all(
                len(chunks[i][1]) <= len(chunks[i - 1][1])
                for i in range(1, len(chunks))
            )
            for x in chunks:
                for request_str, cont_toks, logits in loglikelihoods.get_cache(
                    req_str="".join(x[0]),
                    cxt_toks=x[1],
                    cont_toks=x[2],
                    logits=torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
                    .unsqueeze(0)
                    .unsqueeze(0),
                ):
                    output.append(x[1])
                    outputs_.append(cont_toks)
        assert len(output) == len(outputs_)
        # check indices
        reordered_output = loglikelihoods.get_original(output)
        assert reordered_output == [x[1] for x in loglikelihood_samples]


def test_aggregate_mean():
    # test weight_by_size is respected
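One note on the new test_context_grouping test: the group_fn it passes to Collator builds a key from the context tokens plus all but the last continuation token, which is why, per my reading, samples sharing a context prefix land in the same "contexts" group. A pure-Python sketch (no lm_eval import, my illustration only) of that key:

# Same sample shape and lambda as in the test above.
samples = [(("x", "x"), [1, 2, 3, 4, 5, 6, 7, 8], [x]) for x in range(3)]
group_fn = lambda a: a[-2] + a[-1][:-1]  # noqa: E731

keys = [tuple(group_fn(sample)) for sample in samples]
print(keys)  # three identical (1, 2, 3, 4, 5, 6, 7, 8) keys -> one shared group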