Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
cda25fef
Unverified
Commit
cda25fef
authored
Jan 02, 2024
by
Lintang Sutawika
Committed by
GitHub
Jan 02, 2024
Browse files
Merge branch 'main' into standardize_metrics
parents
dfb41835
4d10ad56
Changes
249
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
110 additions
and
34 deletions
+110
-34
tests/test_janitor.py
tests/test_janitor.py
+2
-10
tests/test_misc.py
tests/test_misc.py
+3
-1
tests/test_tasks.py
tests/test_tasks.py
+6
-2
tests/test_utils.py
tests/test_utils.py
+76
-1
tests/tests_master/test_description.py
tests/tests_master/test_description.py
+2
-2
tests/tests_master/test_generate_13_grams.py
tests/tests_master/test_generate_13_grams.py
+5
-5
tests/tests_master/test_models.py
tests/tests_master/test_models.py
+3
-2
tests/tests_master/test_version_stable.py
tests/tests_master/test_version_stable.py
+9
-7
tests/utils.py
tests/utils.py
+4
-4
No files found.
tests/test_janitor.py
View file @
cda25fef
import
re
from
collections
import
defaultdict
from
lm_eval.decontamination.janitor
import
(
Janitor
,
form_ngrams
,
word_ngrams
,
split_indices
,
word_ngrams
,
word_ngrams_indices
,
)
...
...
@@ -81,7 +80,6 @@ def test_split_indices():
def
test_word_ngrams_indices
():
sequence
=
(
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
...
...
@@ -119,9 +117,9 @@ def test_word_ngrams_indices():
# Assumptions from GPT3 Paper:
# the 200 characters to remove include punctuation and is actually a half-window
# All tests below initially test without any registered contaminants, expecting the same sequence back.
def
test_janitor1
():
# First test using a 1gram and expected the first block before the filth to have some remaining
# characters, but the second block should be completely removed.
...
...
@@ -165,7 +163,6 @@ def test_janitor1():
def
test_janitor2
():
# Second test using a 1gram and expected the first block before the filth to have some remaining
# characters, and the second block is longer then 200 characters so should also have some remaining.
...
...
@@ -214,7 +211,6 @@ def test_janitor2():
def
test_janitor3
():
# Same test as above but with a 6gram.
sequence
=
(
...
...
@@ -262,7 +258,6 @@ def test_janitor3():
def
test_janitor4
():
# This test adds another block to that from the previous. The middle block should be entirely
# removed as the 200 characters are removed from each side.
...
...
@@ -318,7 +313,6 @@ def test_janitor4():
def
test_janitor5
():
# Same as above but using multiple different filth 6grams.
sequence
=
(
...
...
@@ -374,7 +368,6 @@ def test_janitor5():
def
test_janitor6
():
# Same as above but now we add 10 filths and expect the same result, the following test does 11.
sequence
=
(
...
...
@@ -438,7 +431,6 @@ def test_janitor6():
def
test_janitor7
():
# Same as above but now we add 9 filths and expect the same result, the following test does 10.
sequence
=
(
...
...
tests/test_misc.py
View file @
cda25fef
import
random
import
pytest
import
lm_eval.api.metrics
as
metrics
import
random
def
test_bootstrapping
():
...
...
tests/test_tasks.py
View file @
cda25fef
from
itertools
import
islice
import
pytest
from
.utils
import
new_tasks
import
lm_eval.tasks
as
tasks
from
lm_eval.api.task
import
ConfigurableTask
from
.utils
import
new_tasks
tasks
.
initialize_tasks
()
# Default Task
TASKS
=
[
"arc_easy"
]
...
...
@@ -26,7 +30,7 @@ def limit() -> int:
# Tests
@
pytest
.
mark
.
parametrize
(
"task_class"
,
task_class
())
@
pytest
.
mark
.
parametrize
(
"task_class"
,
task_class
()
,
ids
=
lambda
x
:
f
"
{
x
.
config
.
task
}
"
)
class
TestNewTasks
:
def
test_download
(
self
,
task_class
:
ConfigurableTask
):
task_class
.
download
()
...
...
tests/test_utils.py
View file @
cda25fef
from
lm_eval.utils
import
get_rolling_token_windows
,
make_disjoint_window
import
pytest
from
lm_eval.utils
import
Collator
,
get_rolling_token_windows
,
make_disjoint_window
# noinspection DuplicatedCode
...
...
@@ -220,3 +222,76 @@ def test_make_disjoint_window():
)
assert
make_disjoint_window
(([
1
,
2
,
3
,
4
,
5
],
[
4
,
5
,
6
]))
==
([
1
,
2
,
3
],
[
4
,
5
,
6
])
assert
make_disjoint_window
(([
1
,
2
,
3
,
4
,
5
],
[
6
]))
==
([
1
,
2
,
3
,
4
,
5
],
[
6
])
class
TestCollator
:
def
make_generate_sample
(
self
,
end
=
10
):
strings
=
[
"x"
*
i
for
i
in
range
(
1
,
end
+
1
)]
gen_kwargs1
,
gen_kwargs2
=
(
{
"temperature"
:
0
},
{
"temperature"
:
0
,
"until"
:
[
"nn"
,
"
\n\n
"
]},
)
args
=
[
(
string
,
gen_kwargs1
if
i
<
len
(
strings
)
//
2
else
gen_kwargs2
)
for
i
,
string
in
enumerate
(
strings
)
]
return
args
def
make_loglikelihood_sample
(
self
,
end
=
11
):
samples
=
[
((
"x"
,
"x"
),
list
(
range
(
1
,
total_length
+
1
)))
for
total_length
in
range
(
1
,
end
+
1
)
]
return
samples
@
pytest
.
mark
.
parametrize
(
"batch_size, end"
,
[(
17
,
30
),
(
8
,
61
),
(
12
,
48
),
(
0
,
9
)])
def
test_generations
(
self
,
batch_size
,
end
):
_collate_gen
=
lambda
x
:
(
-
len
(
x
[
0
]),
x
[
0
])
# noqa: E731
generation_samples
=
self
.
make_generate_sample
(
int
(
end
))
gens
=
Collator
(
generation_samples
,
_collate_gen
,
grouping
=
True
)
chunks
=
gens
.
get_batched
(
n
=
int
(
batch_size
),
batch_fn
=
None
)
output
=
[]
for
chunks
in
chunks
:
# check batching
group_one
=
end
//
2
group_two
=
end
-
end
//
2
assert
(
len
(
chunks
)
<=
batch_size
if
batch_size
!=
0
else
len
(
chunks
)
in
[
group_one
,
group_two
]
)
# check if reorder-er is working correctly
assert
all
(
len
(
chunks
[
i
][
0
])
<=
len
(
chunks
[
i
-
1
][
0
])
for
i
in
range
(
1
,
len
(
chunks
))
)
# check if grouping correctly
assert
all
(
x
[
1
]
==
chunks
[
0
][
1
]
for
x
in
chunks
)
for
x
in
chunks
:
output
.
append
(
x
)
reordered_output
=
gens
.
get_original
(
output
)
# check get original
assert
reordered_output
==
generation_samples
@
pytest
.
mark
.
parametrize
(
"batch_size, end"
,
[(
17
,
30
),
(
8
,
61
),
(
12
,
48
),
(
0
,
3
)])
def
test_loglikelihood
(
self
,
batch_size
,
end
):
_collate_log
=
lambda
x
:
(
-
len
(
x
[
1
]),
tuple
(
x
[
1
]))
# noqa: E731
loglikelihood_samples
=
self
.
make_loglikelihood_sample
(
int
(
end
))
loglikelihoods
=
Collator
(
loglikelihood_samples
,
_collate_log
,
grouping
=
False
)
chunks
=
loglikelihoods
.
get_batched
(
n
=
int
(
batch_size
),
batch_fn
=
None
)
output
=
[]
for
chunks
in
chunks
:
# check batching
assert
len
(
chunks
)
<=
batch_size
if
batch_size
!=
0
else
len
(
chunks
)
==
end
# check reorder
assert
all
(
len
(
chunks
[
i
][
1
])
<=
len
(
chunks
[
i
-
1
][
1
])
for
i
in
range
(
1
,
len
(
chunks
))
)
for
x
in
chunks
:
output
.
append
(
x
[
1
])
# check indices
reordered_output
=
loglikelihoods
.
get_original
(
output
)
assert
reordered_output
==
[
x
[
1
]
for
x
in
loglikelihood_samples
]
tests/tests_master/test_description.py
View file @
cda25fef
import
random
import
lm_eval.tasks
import
lm_eval.models
import
lm_eval.tasks
def
test_description
():
...
...
@@ -14,7 +15,6 @@ def test_description():
task_dict
=
lm_eval
.
tasks
.
get_task_dict
(
task_names
)
for
task_name
,
task
in
task_dict
.
items
():
# patch description field in task (# TODO: make this much more cleaned up)
task
.
_config
.
description
=
description_dict
[
task_name
]
...
...
tests/tests_master/test_generate_13_grams.py
View file @
cda25fef
import
glob
import
logging
import
os
from
collections
import
Counter
import
shutil
import
glob
from
collections
import
Counter
from
lm_eval.decontamination.archiver
import
Archive
,
TextReader
from
lm_eval.decontamination.janitor
import
Janitor
,
word_ngrams
from
scripts.clean_training_data.generate_13_grams
import
do_ngrams_in_buckets
from
lm_eval.decontamination.archiver
import
Archive
,
TextReader
import
logging
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog):
print
(
"rebuild"
)
rebuilt_ngrams
=
[]
bucket_file_paths
=
glob
.
glob
(
os
.
path
.
join
(
test_working_directory
,
"output"
,
f
"*.bkt.txt"
)
os
.
path
.
join
(
test_working_directory
,
"output"
,
"*.bkt.txt"
)
)
for
bucket_file_path
in
bucket_file_paths
:
reader
=
TextReader
(
bucket_file_path
)
...
...
tests/tests_master/test_models.py
View file @
cda25fef
...
...
@@ -2,12 +2,13 @@ import hashlib
import
json
import
os
import
pickle
import
pytest
import
unittest.mock
as
mock
import
pytest
from
openai
import
OpenAI
import
lm_eval.models
as
models
from
openai
import
OpenAI
client
=
OpenAI
()
...
...
tests/tests_master/test_version_stable.py
View file @
cda25fef
import
lm_eval.tasks
as
tasks
import
lm_eval.models
as
models
import
lm_eval.evaluator
as
evaluator
import
collections
import
hashlib
import
json
import
os
import
random
import
pytest
import
os
import
json
import
hashlib
import
collections
import
lm_eval.evaluator
as
evaluator
import
lm_eval.models
as
models
import
lm_eval.tasks
as
tasks
os
.
makedirs
(
"tests/testdata"
,
exist_ok
=
True
)
...
...
tests/utils.py
View file @
cda25fef
from
typing
import
List
from
lm_eval.utils
import
load_yaml_config
from
pathlib
import
Path
from
typing
import
Union
import
os
from
pathlib
import
Path
from
typing
import
List
,
Union
from
lm_eval.utils
import
load_yaml_config
# {{{CI}}}
...
...
Prev
1
…
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment