Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
11f614b0
Unverified
Commit
11f614b0
authored
Apr 30, 2022
by
Stella Biderman
Committed by
GitHub
Apr 30, 2022
Browse files
Merge branch 'master' into task_doc
parents
0a6a9b7e
e00d682f
Changes
129
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
326 additions
and
25 deletions
+326
-25
scripts/clean_training_data/investigate_pile.py
scripts/clean_training_data/investigate_pile.py
+79
-0
scripts/clean_training_data/sort_13_gram_buckets.py
scripts/clean_training_data/sort_13_gram_buckets.py
+5
-11
setup.py
setup.py
+4
-5
templates/new_multiple_choice_task.py
templates/new_multiple_choice_task.py
+78
-0
templates/new_task.py
templates/new_task.py
+141
-0
tests/test_generate_13_grams.py
tests/test_generate_13_grams.py
+16
-8
tests/test_janitor.py
tests/test_janitor.py
+1
-1
tests/testdata/swag-v0-loglikelihood
tests/testdata/swag-v0-loglikelihood
+1
-0
tests/testdata/swag-v0-res.json
tests/testdata/swag-v0-res.json
+1
-0
No files found.
scripts/clean_training_data/investigate_pile.py
0 → 100644
View file @
11f614b0
from
lm_eval.decontamination.archiver
import
Reader
import
os
import
json
from
functools
import
reduce
import
glob
import
tqdm
from
tqdm_multiprocess
import
TqdmMultiProcessPool
def get_file_stats(file_path, tqdm_func, global_tqdm):
    """Count the documents in one archive and accumulate their reported size.

    Progress is reported in bytes of the file on disk: every
    ``update_frequency`` documents the underlying file handle position is
    sampled and the delta is pushed to both progress bars.

    :param file_path: path of the archive handed to ``Reader.read``.
    :param tqdm_func: factory used as a context manager to create the
        per-file progress bar.
    :param global_tqdm: progress bar shared by all files; it receives the same
        byte deltas as the per-file bar.
    :returns: ``(total_documents, total_size)`` for this file.
    """
    reader = Reader()
    total_documents = 0
    total_size = 0
    update_frequency = 10000
    current_file_position = 0
    file_size = os.path.getsize(file_path)

    with tqdm_func(
        total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1
    ) as progress:
        for document in reader.read(file_path, get_meta=True):
            # NOTE(review): with `get_meta=True` the reader may yield
            # (text, meta) pairs, in which case `len(document)` is not a
            # character count -- confirm against `Reader.read`.
            total_size += len(document)
            total_documents += 1

            # Sampling the file position is only done periodically to keep the
            # per-document overhead low.
            if total_documents % update_frequency == 0:
                new_file_pos = reader.fh.tell()
                bytes_read = new_file_pos - current_file_position
                current_file_position = new_file_pos
                progress.update(bytes_read)
                global_tqdm.update(bytes_read)

        # The loop only reports on multiples of `update_frequency`; account for
        # the tail so both bars actually reach their totals. The on-disk size
        # is used because the reader's handle may already be closed here.
        remaining_bytes = file_size - current_file_position
        if remaining_bytes > 0:
            progress.update(remaining_bytes)
            global_tqdm.update(remaining_bytes)

    return (total_documents, total_size)
def get_files(directory="pile"):
    """Return the sorted archive paths found directly inside ``directory``.

    :param directory: folder to scan; defaults to ``"pile"`` so existing
        zero-argument callers keep their behaviour.
    :returns: sorted list of paths matching ``*.jsonl.zst*`` (empty when
        nothing matches). The list is also printed for the operator.
    """
    # `sorted` already returns a new list, so no extra `list(...)` is needed.
    files = sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))
    print(files)
    return files
def get_stats():
    """Gather document statistics for every file returned by ``get_files``.

    Each file is processed by ``get_file_stats`` through a
    ``TqdmMultiProcessPool`` of 4 workers, with one global progress bar sized
    to the combined on-disk size of all files.

    :returns: ``(total_documents, total_size, start_offsets)`` where
        ``start_offsets[i]`` is the number of documents contained in the files
        that precede file ``i``. With no input files this is ``(0, 0, [])``.
    """
    files = get_files()
    total_size_bytes = sum(os.path.getsize(path) for path in files)

    pool = TqdmMultiProcessPool(4)
    global_tqdm = tqdm.tqdm(
        total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1
    )

    # One task per file; the pool supplies the progress-bar arguments.
    tasks = [(get_file_stats, (file,)) for file in files]

    def _ignore(_):
        # NOTE(review): both callbacks are no-ops, so anything the pool reports
        # through them is discarded -- confirm that is intended for errors.
        return None

    on_done = _ignore
    on_error = _ignore
    results = pool.map(global_tqdm, tasks, on_error, on_done)

    # Single pass: the running document total before each file is that file's
    # start offset. Starting from zero also keeps an empty file list from
    # raising, which a seedless `reduce` would do.
    total_documents = 0
    total_size = 0
    start_offsets = []
    for file_document_count, file_size in results:
        start_offsets.append(total_documents)
        total_documents += file_document_count
        total_size += file_size

    return (total_documents, total_size, start_offsets)
if __name__ == '__main__':
    version = 1.01
    print(f"Running version {version}")

    # Statistics are cached on disk: reuse them when present, otherwise
    # compute them once and persist the result.
    stats_file_path = "pile_statistics.json"
    if os.path.exists(stats_file_path):
        # Context managers make sure the handle is closed deterministically.
        with open(stats_file_path, "r") as stats_file:
            stats = json.load(stats_file)
    else:
        document_count, total_document_size_chars, start_offsets = get_stats()
        stats = {
            "Data": "Pile statistics",
            "Document Count": document_count,
            "Total Pile Characters": total_document_size_chars,
            "File Start Offsets": start_offsets,
        }
        # Closing the file explicitly guarantees the JSON is flushed to disk
        # before the script goes on.
        with open(stats_file_path, "w") as stats_file:
            json.dump(stats, stats_file, indent=4)

    print(f"document_count: {stats['Document Count']}")
    print(f"total_chars: {stats['Total Pile Characters']}")
    print(f"start_offsets: {stats['File Start Offsets']}")
scripts/clean_training_data/sort_13_gram_buckets.py
View file @
11f614b0
"""
Iteratively runs gnu sort on each bucket,
gnu handles the multiprocessing
.
Iteratively runs gnu sort on each bucket,
uses up to 8 cores
.
Arguments
---------
...
...
@@ -11,10 +11,8 @@ Arguments
import
glob
import
argparse
import
os
from
pathlib
import
Path
import
signal
from
signal
import
SIGINT
import
re
import
subprocess
from
tqdm
import
tqdm
...
...
@@ -32,12 +30,6 @@ def sort_13_gram_buckets(working_directory):
bucket_file_paths
=
glob
.
glob
(
os
.
path
.
join
(
working_directory
,
f
"*.bkt.txt"
))
for
bucket_file_path
in
tqdm
(
bucket_file_paths
,
dynamic_ncols
=
True
):
bucket_id
=
re
.
sub
(
"\D"
,
""
,
os
.
path
.
basename
(
bucket_file_path
))
done_file
=
os
.
path
.
join
(
working_directory
,
f
"ngram_bucket_sorting_
{
bucket_id
}
.done"
)
if
os
.
path
.
exists
(
done_file
):
logger
.
info
(
f
"bucket
{
bucket_id
}
already processed, skipping"
)
return
sorted_file_path
=
bucket_file_path
+
".sorted"
command
=
f
"sort
{
bucket_file_path
}
>
{
sorted_file_path
}
"
logger
.
info
(
command
)
...
...
@@ -46,7 +38,6 @@ def sort_13_gram_buckets(working_directory):
if
terminate
:
return
Path
(
done_file
).
touch
()
os
.
remove
(
bucket_file_path
)
parser
=
argparse
.
ArgumentParser
(
description
=
'sort 13gram buckets'
)
...
...
@@ -54,6 +45,9 @@ parser.add_argument("-dir", "--working_directory", default="")
if
__name__
==
'__main__'
:
version
=
1.00
print
(
f
"Running version
{
version
}
"
)
# Handle sigint (ctrl-c) cleanly
previous_signal_int
=
signal
.
signal
(
SIGINT
,
handler
)
...
...
@@ -61,4 +55,4 @@ if __name__ == '__main__':
setup_logger_tqdm
(
logfile_path
)
args
=
parser
.
parse_args
()
sort_13_gram_buckets
(
args
.
working_directory
)
\ No newline at end of file
sort_13_gram_buckets
(
args
.
working_directory
)
setup.py
View file @
11f614b0
...
...
@@ -20,9 +20,7 @@ setuptools.setup(
],
python_requires
=
'>=3.6'
,
install_requires
=
[
"black"
,
"best_download==0.0.9"
,
"datasets==1.15.1"
,
"datasets>=2.0.0"
,
"click>=7.1"
,
"scikit-learn>=0.24.1"
,
"torch>=1.7"
,
...
...
@@ -34,7 +32,6 @@ setuptools.setup(
"pycountry==20.7.3"
,
"numexpr==2.7.2"
,
"lm_dataformat==0.0.20"
,
"pytest==6.2.3"
,
"pybind11==2.6.2"
,
"tqdm-multiprocess==0.0.11"
,
"zstandard==0.15.2"
,
...
...
@@ -43,8 +40,10 @@ setuptools.setup(
"openai==0.6.4"
,
"jieba==0.42.1"
,
"nagisa==0.2.7"
,
"bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
],
dependency_links
=
[
"https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
,
]
],
extras_require
=
{
'dev'
:
[
'pytest'
,
'black'
]}
)
templates/new_multiple_choice_task.py
0 → 100644
View file @
11f614b0
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Homepage: TODO: Add the URL to the task's Homepage here.
"""
from
lm_eval.base
import
MultipleChoiceTask
# TODO: Add the BibTeX citation for the task.
_CITATION
=
"""
"""
# TODO: Replace `NewTask` with the name of your Task.
class NewTask(MultipleChoiceTask):
    VERSION = 0
    # TODO: Set `DATASET_PATH` to the name of the `Task` dataset as it is
    # listed in HuggingFace `datasets`.
    DATASET_PATH = ""
    # TODO: Set `DATASET_NAME` to the subset of `DATASET_PATH` to use. Leave
    # it as `None` if no specific subset is needed.
    DATASET_NAME = None

    def has_training_docs(self):
        """Report whether this Task provides training data."""
        # TODO: Return `True` if the Task has training data; else `False`.
        return False

    def has_validation_docs(self):
        """Report whether this Task provides validation data."""
        # TODO: Return `True` if the Task has validation data; else `False`.
        return False

    def has_test_docs(self):
        """Report whether this Task provides test data."""
        # TODO: Return `True` if the Task has test data; else `False`.
        return False

    def training_docs(self):
        """Return the processed training documents, or `None` without a train split."""
        if not self.has_training_docs():
            return None
        # Training documents are cached in `self._training_docs` to speed up
        # few-shot processing. If the data does not fit in memory, return the
        # training data as a generator instead of a list.
        if self._training_docs is None:
            # TODO: Build the training documents from `self.dataset`.
            # In most cases this can stay as is, unless the dataset split is
            # named differently than the default `"train"`.
            self._training_docs = [
                self._process_doc(doc) for doc in self.dataset["train"]
            ]
        return self._training_docs

    def validation_docs(self):
        """Return a lazy iterator of processed validation documents, or `None`."""
        if not self.has_validation_docs():
            return None
        # TODO: Return the validation document generator from `self.dataset`.
        # In most cases this can stay as is, unless the dataset split is
        # named differently than the default `"validation"`.
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        """Return a lazy iterator of processed test documents, or `None`."""
        if not self.has_test_docs():
            return None
        # TODO: Return the test document generator from `self.dataset`.
        # In most cases this can stay as is, unless the dataset split is
        # named differently than the default `"test"`.
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
        """Convert one raw dataset record into the multiple-choice layout."""
        # TODO: Process the document into a dictionary with the following keys:
        processed = {
            "query": "",  # The query prompt.
            "choices": [],  # The list of choices.
            "gold": 0,  # The integer index of the correct element of `"choices"`.
        }
        return processed

    def doc_to_text(self, doc):
        """Return the prompt text for a processed document."""
        # TODO: Format the query prompt portion of the document example.
        query = doc["query"]
        return query
templates/new_task.py
0 → 100644
View file @
11f614b0
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Homepage: TODO: Add the URL to the task's Homepage here.
"""
from
lm_eval.base
import
Task
# TODO: Add the BibTeX citation for the task.
_CITATION
=
"""
"""
# TODO: Replace `NewTask` with the name of your Task.
class NewTask(Task):
    VERSION = 0
    # TODO: Set `DATASET_PATH` to the name of the `Task` dataset as it is
    # listed in HuggingFace `datasets`.
    DATASET_PATH = ""
    # TODO: Set `DATASET_NAME` to the subset of `DATASET_PATH` to use. Leave
    # it as `None` if no specific subset is needed.
    DATASET_NAME = None

    def has_training_docs(self):
        """Report whether this Task provides training data."""
        # TODO: Return `True` if the Task has training data; else `False`.
        return False

    def has_validation_docs(self):
        """Report whether this Task provides validation data."""
        # TODO: Return `True` if the Task has validation data; else `False`.
        return False

    def has_test_docs(self):
        """Report whether this Task provides test data."""
        # TODO: Return `True` if the Task has test data; else `False`.
        return False

    def training_docs(self):
        """Return the cached list of training documents, or `None` without a train split."""
        if not self.has_training_docs():
            return None
        # Training documents are cached in `self._training_docs` to speed up
        # few-shot processing. If the data does not fit in memory, return the
        # training data as a generator instead of a list.
        if self._training_docs is None:
            # TODO: Build the training documents from `self.dataset`.
            # To process the data, `map` the custom processing function over
            # the documents, e.g.
            # `map(self._process_doc, self.dataset["train"])`.
            # In most cases this can stay as is, unless the dataset split is
            # named differently than the default `"train"`.
            self._training_docs = list(self.dataset["train"])
        return self._training_docs

    def validation_docs(self):
        """Return the validation documents, or `None` without a validation split."""
        if not self.has_validation_docs():
            return None
        # TODO: Return the validation document generator from `self.dataset`.
        # To process the data, `map` the custom processing function over the
        # documents, e.g.
        # `map(self._process_doc, self.dataset["validation"])`.
        # In most cases this can stay as is, unless the dataset split is
        # named differently than the default `"validation"`.
        return self.dataset["validation"]

    def test_docs(self):
        """Return the test documents, or `None` without a test split."""
        if not self.has_test_docs():
            return None
        # TODO: Return the test document generator from `self.dataset`.
        # To process the data, `map` the custom processing function over the
        # documents, e.g.
        # `map(self._process_doc, self.dataset["test"])`.
        # In most cases this can stay as is, unless the dataset split is
        # named differently than the default `"test"`.
        return self.dataset["test"]

    def _process_doc(self, doc):
        # TODO: Process (detokenize, strip, replace etc.) each individual `doc`
        # here. This function can be mapped across the docs of every available
        # dataset split; the TODOs in `training_docs`, `validation_docs`, and
        # `test_docs` show how.
        # NOTE: DELETE THIS FUNCTION IF UNUSED.
        return doc

    def doc_to_text(self, doc):
        """Return the prompt text for a document."""
        # TODO: Format the query prompt portion of the document example.
        return ""

    def doc_to_target(self, doc):
        """Return the gold answer for a document, prefixed with one space."""
        # TODO: Fill in the `target` ("gold answer") variable.
        target = ""
        # The leading `" "` separates the `doc_to_text` string from the
        # `doc_to_target` string when the two are joined.
        return " " + target

    def construct_requests(self, doc, ctx):
        """Build the Requests that will be sent to the LM for one document.

        :param doc:
            A document as produced by training_docs, validation_docs, or
            test_docs.
        :param ctx: str
            The context string produced by fewshot_context: the natural
            language description, the few shot examples, and the question
            part of `doc`.
        :returns: an iterable of Requests.
        """
        # TODO: Construct your language model requests with the request
        # factory, `rf`, and return them as an iterable.
        requests = []
        return requests

    def process_results(self, doc, results):
        """Score a single document against the LM results.

        :param doc:
            A document as produced by training_docs, validation_docs, or
            test_docs.
        :param results:
            The results of the requests created in construct_requests.
        :returns: a dict mapping each submetric name to its value for this
            one document.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value
        # pair with the metric name as key and the corresponding metric result
        # for the current `doc` as value.
        metrics = {}
        return metrics

    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
            A dictionary mapping each submetric name to a function that
            aggregates a list of metric scores
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value
        # pair with the metric name as key and an aggregation function as
        # value; the function decides how per-document results are combined
        # over the dataset. See `lm_eval.metrics` for built-in aggregations.
        aggregators = {}
        return aggregators

    def higher_is_better(self):
        """Return, per submetric name, whether larger values are better."""
        # TODO: For each (sub)metric in the task evaluation, add a key-value
        # pair with the metric name as key and a `bool` value stating whether
        # higher values of that metric are deemed better.
        directions = {}
        return directions
tests/test_generate_13_grams.py
View file @
11f614b0
...
...
@@ -3,12 +3,14 @@ from collections import Counter
import
shutil
import
glob
from
scripts.clean_training_data
.janitor
import
*
from
lm_eval.decontamination
.janitor
import
*
from
scripts.clean_training_data.generate_13_grams
import
do_ngrams_in_buckets
from
scripts.clean_training_data
.archiver
import
Archive
,
TextReader
from
lm_eval.decontamination
.archiver
import
Archive
,
TextReader
import
logging
logger
=
logging
.
getLogger
(
__name__
)
def
test_generate_13_grams_1
():
def
test_generate_13_grams_1
(
caplog
):
data
=
"""A goose (plural geese) is a bird of any of several waterfowl species in the family Anatidae.
This group comprises the genera Anser (the grey geese and white geese) and Branta (the black geese).
Some other birds, mostly related to the shelducks, have "goose" as part of their names.
...
...
@@ -22,6 +24,7 @@ def test_generate_13_grams_1():
data
=
data
+
data
# Simple Generation
print
(
"simple generation"
)
n
=
13
janitor
=
Janitor
()
ngrams
=
word_ngrams
(
janitor
.
normalize_string
(
data
),
n
)
...
...
@@ -31,22 +34,26 @@ def test_generate_13_grams_1():
# print(comparison)
# Generating into buckets
print
(
"bucket generation"
)
test_working_directory
=
"test_generate_13_grams"
output_directory
=
os
.
path
.
join
(
test_working_directory
,
"output"
)
try
:
shutil
.
rmtree
(
output
_directory
)
shutil
.
rmtree
(
test_working
_directory
)
except
FileNotFoundError
:
pass
os
.
makedirs
(
test_working_directory
,
exist_ok
=
True
)
archive
=
Archive
(
os
.
path
.
join
(
test_working_directory
,
"test.jsonl.zst"
))
os
.
makedirs
(
test_working_directory
)
assert
(
not
os
.
path
.
exists
(
"pile"
))
os
.
makedirs
(
"pile"
)
archive
=
Archive
(
os
.
path
.
join
(
"pile"
,
"test.jsonl.zst"
))
archive
.
add_data
(
data
)
archive
.
commit
()
bucket_count
=
4
do_ngrams_in_buckets
(
n
,
test_working_directory
,
bucket_count
)
# Rebuild from buckets
print
(
"rebuild"
)
rebuilt_ngrams
=
[]
bucket_file_paths
=
glob
.
glob
(
os
.
path
.
join
(
test_working_directory
,
"output"
,
f
"*.bkt.txt"
))
for
bucket_file_path
in
bucket_file_paths
:
reader
=
TextReader
(
bucket_file_path
)
...
...
@@ -56,6 +63,7 @@ def test_generate_13_grams_1():
rebuilt_ngrams
.
append
(
ngram
)
# Compare
print
(
"compare"
)
result_counter
=
Counter
(
rebuilt_ngrams
)
# print(len(result_counter))
# print(len(comparison_counter))
...
...
tests/test_janitor.py
View file @
11f614b0
import
re
from
collections
import
defaultdict
from
scripts.clean_training_data
.janitor
import
*
from
lm_eval.decontamination
.janitor
import
*
def
simple_ngram
(
sequence
,
n
):
ngrams
=
list
()
...
...
tests/testdata/swag-v0-loglikelihood
0 → 100644
View file @
11f614b0
be4fcbad876124c4ba3c71970538a97fec0e36a9cc677c70b6c9243a7bcee0ec
\ No newline at end of file
tests/testdata/swag-v0-res.json
0 → 100644
View file @
11f614b0
{
"results"
:
{
"swag"
:
{
"acc"
:
0.2482255323402979
,
"acc_norm"
:
0.24882535239428172
,
"acc_norm_stderr"
:
0.00305666959496067
,
"acc_stderr"
:
0.003054201832644171
}},
"versions"
:
{
"swag"
:
0
}}
\ No newline at end of file
Prev
1
…
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment