gaoqiong / lm-evaluation-harness / Commits

Commit cda25fef (unverified)
Authored Jan 02, 2024 by Lintang Sutawika; committed via GitHub on Jan 02, 2024
Parents: dfb41835, 4d10ad56

    Merge branch 'main' into standardize_metrics

Changes: 249 in total; this page shows 9 changed files with 110 additions and 34 deletions (+110 −34).
tests/test_janitor.py                          +2   -10
tests/test_misc.py                             +3   -1
tests/test_tasks.py                            +6   -2
tests/test_utils.py                            +76  -1
tests/tests_master/test_description.py         +2   -2
tests/tests_master/test_generate_13_grams.py   +5   -5
tests/tests_master/test_models.py              +3   -2
tests/tests_master/test_version_stable.py      +9   -7
tests/utils.py                                 +4   -4
tests/test_janitor.py  (+2 -10)

-import re
 from collections import defaultdict
 from lm_eval.decontamination.janitor import (
     Janitor,
     form_ngrams,
-    word_ngrams,
     split_indices,
+    word_ngrams,
     word_ngrams_indices,
 )

@@ -81,7 +80,6 @@ def test_split_indices():
 def test_word_ngrams_indices():
     sequence = (
         "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
         " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."

@@ -119,9 +117,9 @@ def test_word_ngrams_indices():
 # Assumptions from GPT3 Paper:
 # the 200 characters to remove include punctuation and is actually a half-window
 # All tests below initially test without any registered contaminants, expecting the same sequence back.
 def test_janitor1():
     # First test using a 1gram and expected the first block before the filth to have some remaining
     # characters, but the second block should be completely removed.

@@ -165,7 +163,6 @@ def test_janitor1():
 def test_janitor2():
     # Second test using a 1gram and expected the first block before the filth to have some remaining
     # characters, and the second block is longer then 200 characters so should also have some remaining.

@@ -214,7 +211,6 @@ def test_janitor2():
 def test_janitor3():
     # Same test as above but with a 6gram.
     sequence = (

@@ -262,7 +258,6 @@ def test_janitor3():
 def test_janitor4():
     # This test adds another block to that from the previous. The middle block should be entirely
     # removed as the 200 characters are removed from each side.

@@ -318,7 +313,6 @@ def test_janitor4():
 def test_janitor5():
     # Same as above but using multiple different filth 6grams.
     sequence = (

@@ -374,7 +368,6 @@ def test_janitor5():
 def test_janitor6():
     # Same as above but now we add 10 filths and expect the same result, the following test does 11.
     sequence = (

@@ -438,7 +431,6 @@ def test_janitor6():
 def test_janitor7():
     # Same as above but now we add 9 filths and expect the same result, the following test does 10.
     sequence = (
...
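For readers unfamiliar with the decontamination helpers these tests import, here is a minimal sketch of how the two n-gram utilities are typically called. The exact signatures are assumptions inferred from the imports and test names above, not something this diff shows.

# Hypothetical usage sketch; signatures of word_ngrams/form_ngrams are assumed.
from lm_eval.decontamination.janitor import form_ngrams, word_ngrams

text = "I like eating pizza, chicken, chips and ice cream"

# word_ngrams: assumed to yield space-joined n-grams over whitespace-split words.
trigrams = list(word_ngrams(text, 3))
print(trigrams[0])  # expected (under that assumption): "I like eating"

# form_ngrams: assumed to yield tuples of n consecutive items from an iterator.
bigrams = list(form_ngrams(iter(text.split()), 2))
print(bigrams[0])  # expected (under that assumption): ("I", "like")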
tests/test_misc.py  (+3 -1)

+import random
+
 import pytest
+
 import lm_eval.api.metrics as metrics
-import random


 def test_bootstrapping():
...
tests/test_tasks.py  (+6 -2)

 from itertools import islice
+
 import pytest
-from .utils import new_tasks
+
 import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask
+
+from .utils import new_tasks
+
 tasks.initialize_tasks()

 # Default Task
 TASKS = ["arc_easy"]

@@ -26,7 +30,7 @@ def limit() -> int:
 # Tests
-@pytest.mark.parametrize("task_class", task_class())
+@pytest.mark.parametrize("task_class", task_class(), ids=lambda x: f"{x.config.task}")
 class TestNewTasks:
     def test_download(self, task_class: ConfigurableTask):
         task_class.download()
...
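Apart from the import reordering, the functional change here is the added ids= argument on @pytest.mark.parametrize, which makes each parametrized case appear under its task name rather than an opaque index. A small self-contained illustration of the effect (the _FakeTask and _Cfg classes are hypothetical stand-ins, not from the repo):

import pytest


class _Cfg:
    def __init__(self, task):
        self.task = task


class _FakeTask:
    # Hypothetical stand-in for ConfigurableTask, used only to illustrate ids=.
    def __init__(self, task):
        self.config = _Cfg(task)


@pytest.mark.parametrize(
    "task_class",
    [_FakeTask("arc_easy"), _FakeTask("hellaswag")],
    ids=lambda x: f"{x.config.task}",
)
def test_has_task_name(task_class):
    # Collected as test_has_task_name[arc_easy] and test_has_task_name[hellaswag]
    # instead of test_has_task_name[task_class0] / [task_class1].
    assert task_class.config.task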
tests/test_utils.py  (+76 -1)

-from lm_eval.utils import get_rolling_token_windows, make_disjoint_window
+import pytest
+
+from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window

 # noinspection DuplicatedCode
...

@@ -220,3 +222,76 @@ def test_make_disjoint_window():
     )
     assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6])
     assert make_disjoint_window(([1, 2, 3, 4, 5], [6])) == ([1, 2, 3, 4, 5], [6])
+
+
+class TestCollator:
+    def make_generate_sample(self, end=10):
+        strings = ["x" * i for i in range(1, end + 1)]
+        gen_kwargs1, gen_kwargs2 = (
+            {"temperature": 0},
+            {"temperature": 0, "until": ["nn", "\n\n"]},
+        )
+        args = [
+            (string, gen_kwargs1 if i < len(strings) // 2 else gen_kwargs2)
+            for i, string in enumerate(strings)
+        ]
+
+        return args
+
+    def make_loglikelihood_sample(self, end=11):
+        samples = [
+            (("x", "x"), list(range(1, total_length + 1)))
+            for total_length in range(1, end + 1)
+        ]
+        return samples
+
+    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
+    def test_generations(self, batch_size, end):
+        _collate_gen = lambda x: (-len(x[0]), x[0])  # noqa: E731
+        generation_samples = self.make_generate_sample(int(end))
+        gens = Collator(generation_samples, _collate_gen, grouping=True)
+        chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
+        output = []
+        for chunks in chunks:
+            # check batching
+            group_one = end // 2
+            group_two = end - end // 2
+            assert (
+                len(chunks) <= batch_size
+                if batch_size != 0
+                else len(chunks) in [group_one, group_two]
+            )
+            # check if reorder-er is working correctly
+            assert all(
+                len(chunks[i][0]) <= len(chunks[i - 1][0])
+                for i in range(1, len(chunks))
+            )
+            # check if grouping correctly
+            assert all(x[1] == chunks[0][1] for x in chunks)
+            for x in chunks:
+                output.append(x)
+        reordered_output = gens.get_original(output)
+        # check get original
+        assert reordered_output == generation_samples

+    @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 3)])
+    def test_loglikelihood(self, batch_size, end):
+        _collate_log = lambda x: (-len(x[1]), tuple(x[1]))  # noqa: E731
+        loglikelihood_samples = self.make_loglikelihood_sample(int(end))
+        loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
+        chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
+        output = []
+        for chunks in chunks:
+            # check batching
+            assert len(chunks) <= batch_size if batch_size != 0 else len(chunks) == end
+            # check reorder
+            assert all(
+                len(chunks[i][1]) <= len(chunks[i - 1][1])
+                for i in range(1, len(chunks))
+            )
+            for x in chunks:
+                output.append(x[1])
+        # check indices
+        reordered_output = loglikelihoods.get_original(output)
+        assert reordered_output == [x[1] for x in loglikelihood_samples]
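The bulk of the additions is the new TestCollator class, which exercises the sort → batch → restore-order cycle of lm_eval.utils.Collator. Below is a minimal usage sketch based only on the calls made in the tests above (the constructor, get_batched, and get_original); any behavior beyond what those tests assert is an assumption.

# Usage sketch for Collator, mirroring the calls in TestCollator above.
from lm_eval.utils import Collator

# Requests shaped like the generation samples in the test: (context, gen_kwargs).
requests = [("x" * n, {"temperature": 0}) for n in range(1, 6)]

# Sort longest-context-first, grouping requests that share gen_kwargs
# (the same sort key and grouping flag the test uses).
collator = Collator(requests, lambda x: (-len(x[0]), x[0]), grouping=True)

processed = []
for batch in collator.get_batched(n=2, batch_fn=None):
    # A real caller would run the model on `batch` here; the tests just
    # collect the items back unchanged.
    processed.extend(batch)

# get_original restores the caller's original ordering, as the tests assert.
assert collator.get_original(processed) == requests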
tests/tests_master/test_description.py  (+2 -2)

 import random
+
-import lm_eval.tasks
 import lm_eval.models
+import lm_eval.tasks


 def test_description():
...

@@ -14,7 +15,6 @@ def test_description():
     task_dict = lm_eval.tasks.get_task_dict(task_names)

     for task_name, task in task_dict.items():
         # patch description field in task (# TODO: make this much more cleaned up)
         task._config.description = description_dict[task_name]
...
tests/tests_master/test_generate_13_grams.py  (+5 -5)

+import glob
+import logging
 import os
-from collections import Counter
 import shutil
-import glob
+from collections import Counter
+from lm_eval.decontamination.archiver import Archive, TextReader
 from lm_eval.decontamination.janitor import Janitor, word_ngrams
 from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets
-from lm_eval.decontamination.archiver import Archive, TextReader
-import logging

 logger = logging.getLogger(__name__)
...

@@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog):
     print("rebuild")
     rebuilt_ngrams = []

     bucket_file_paths = glob.glob(
-        os.path.join(test_working_directory, "output", f"*.bkt.txt")
+        os.path.join(test_working_directory, "output", "*.bkt.txt")
     )
     for bucket_file_path in bucket_file_paths:
         reader = TextReader(bucket_file_path)
...
tests/tests_master/test_models.py  (+3 -2)

@@ -2,12 +2,13 @@ import hashlib
 import json
 import os
 import pickle
-import pytest
 import unittest.mock as mock
+
+import pytest
+from openai import OpenAI

 import lm_eval.models as models
-from openai import OpenAI

 client = OpenAI()
...
tests/tests_master/test_version_stable.py  (+9 -7)

-import lm_eval.tasks as tasks
-import lm_eval.models as models
-import lm_eval.evaluator as evaluator
+import collections
+import hashlib
+import json
+import os
 import random
+
 import pytest
-import os
-import json
-import hashlib
-import collections
+
+import lm_eval.evaluator as evaluator
+import lm_eval.models as models
+import lm_eval.tasks as tasks

 os.makedirs("tests/testdata", exist_ok=True)
...
tests/utils.py  (+4 -4)

-from typing import List
-from lm_eval.utils import load_yaml_config
-from pathlib import Path
-from typing import Union
 import os
+from pathlib import Path
+from typing import List, Union
+
+from lm_eval.utils import load_yaml_config

 # {{{CI}}}
...