Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
d95a4333
Commit
d95a4333
authored
May 02, 2022
by
Fabrizio Milo
Browse files
fix codespell
parent
121b7096
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
39 additions
and
36 deletions
+39
-36
.pre-commit-config.yaml
.pre-commit-config.yaml
+5
-5
ignore.txt
ignore.txt
+3
-0
lm_eval/base.py
lm_eval/base.py
+8
-8
lm_eval/datasets/drop/drop.py
lm_eval/datasets/drop/drop.py
+1
-1
lm_eval/datasets/sat_analogies/sat_analogies.py
lm_eval/datasets/sat_analogies/sat_analogies.py
+1
-1
lm_eval/decontamination/janitor.py
lm_eval/decontamination/janitor.py
+2
-2
lm_eval/models/gpt2.py
lm_eval/models/gpt2.py
+1
-1
lm_eval/models/gpt3.py
lm_eval/models/gpt3.py
+6
-6
lm_eval/tasks/drop.py
lm_eval/tasks/drop.py
+3
-3
lm_eval/tasks/hendrycks_ethics.py
lm_eval/tasks/hendrycks_ethics.py
+2
-2
scripts/clean_training_data/README.md
scripts/clean_training_data/README.md
+2
-2
scripts/clean_training_data/process_sorted_buckets.py
scripts/clean_training_data/process_sorted_buckets.py
+1
-1
templates/new_multiple_choice_task.py
templates/new_multiple_choice_task.py
+1
-1
templates/new_task.py
templates/new_task.py
+3
-3
No files found.
.pre-commit-config.yaml
View file @
d95a4333
...
@@ -35,8 +35,8 @@ repos:
...
@@ -35,8 +35,8 @@ repos:
rev
:
v2.1.0
rev
:
v2.1.0
hooks
:
hooks
:
-
id
:
codespell
-
id
:
codespell
args
:
[
exclude
:
>
"
--ignore-words-list=reord"
,
# Word used in error messages that need rewording
(?x)^(
--check-filenames
,
.*\.json|ignore.txt
--check-hidden
,
)$
]
args
:
[
--check-filenames
,
--check-hidden
,
--ignore-words=ignore.txt
]
ignore.txt
0 → 100644
View file @
d95a4333
ROUGE
rouge
nin
lm_eval/base.py
View file @
d95a4333
...
@@ -51,7 +51,7 @@ class LM(abc.ABC):
...
@@ -51,7 +51,7 @@ class LM(abc.ABC):
- We will use the full max context length of the model.
- We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
the max context length.
the max context length.
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementa
i
tons
- IMPORTANT: Each document's loglikelihood/perplexity is computed *separately*, unlike other implementat
i
ons
which may simply concatenate multiple documents together.
which may simply concatenate multiple documents together.
- IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
- IMPORTANT: We maximize the amount of context for each prediction. Specifically, for inputs that we break into
multiple chunks, the last input will still have a full-sized context.
multiple chunks, the last input will still have a full-sized context.
...
@@ -234,9 +234,9 @@ class BaseLM(LM):
...
@@ -234,9 +234,9 @@ class BaseLM(LM):
return
-
len
(
toks
),
tuple
(
toks
)
return
-
len
(
toks
),
tuple
(
toks
)
# TODO: automatic (variable) batch size detection for vectorization
# TODO: automatic (variable) batch size detection for vectorization
reord
=
utils
.
Reorderer
(
requests
,
_collate
)
re
_
ord
=
utils
.
Reorderer
(
requests
,
_collate
)
for
chunk
in
utils
.
chunks
(
for
chunk
in
utils
.
chunks
(
tqdm
(
reord
.
get_reordered
(),
disable
=
disable_tqdm
),
self
.
batch_size
tqdm
(
re
_
ord
.
get_reordered
(),
disable
=
disable_tqdm
),
self
.
batch_size
):
):
inps
=
[]
inps
=
[]
cont_toks_list
=
[]
cont_toks_list
=
[]
...
@@ -327,10 +327,10 @@ class BaseLM(LM):
...
@@ -327,10 +327,10 @@ class BaseLM(LM):
res
.
append
(
answer
)
res
.
append
(
answer
)
return
reord
.
get_original
(
res
)
return
re
_
ord
.
get_original
(
res
)
def
greedy_until
(
self
,
requests
):
def
greedy_until
(
self
,
requests
):
# TODO: implement fully general `until` that handles until
s
that are
# TODO: implement fully general `until` that handles `until` sequences that are
# multiple tokens or that span multiple tokens correctly
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
# TODO: extract to TokenizedLM?
...
@@ -340,9 +340,9 @@ class BaseLM(LM):
...
@@ -340,9 +340,9 @@ class BaseLM(LM):
toks
=
self
.
tok_encode
(
x
[
0
])
toks
=
self
.
tok_encode
(
x
[
0
])
return
len
(
toks
),
x
[
0
]
return
len
(
toks
),
x
[
0
]
reord
=
utils
.
Reorderer
(
requests
,
_collate
)
re
_
ord
=
utils
.
Reorderer
(
requests
,
_collate
)
for
context
,
until
in
tqdm
(
reord
.
get_reordered
()):
for
context
,
until
in
tqdm
(
re
_
ord
.
get_reordered
()):
if
isinstance
(
until
,
str
):
if
isinstance
(
until
,
str
):
until
=
[
until
]
until
=
[
until
]
...
@@ -366,7 +366,7 @@ class BaseLM(LM):
...
@@ -366,7 +366,7 @@ class BaseLM(LM):
res
.
append
(
s
)
res
.
append
(
s
)
return
reord
.
get_original
(
res
)
return
re
_
ord
.
get_original
(
res
)
class
Task
(
abc
.
ABC
):
class
Task
(
abc
.
ABC
):
...
...
lm_eval/datasets/drop/drop.py
View file @
d95a4333
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
#
#
# Custom DROP dataet that, unlike HF, keeps all question-answer pairs
# Custom DROP data
s
et that, unlike HF, keeps all question-answer pairs
# even if there are multiple types of answers for the same question.
# even if there are multiple types of answers for the same question.
"""DROP dataset."""
"""DROP dataset."""
...
...
lm_eval/datasets/sat_analogies/sat_analogies.py
View file @
d95a4333
...
@@ -61,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
...
@@ -61,7 +61,7 @@ class SatAnalogies(datasets.GeneratorBasedBuilder):
return
(
return
(
"To use SAT Analogy Questions you have to download it manually. Please "
"To use SAT Analogy Questions you have to download it manually. Please "
"email Peter Turney to request the data (https://www.apperceptual.com). "
"email Peter Turney to request the data (https://www.apperceptual.com). "
"Once you rec
i
eve a download link for the dataset, supply the local path "
"Once you rece
i
ve a download link for the dataset, supply the local path "
"as the `data_dir` arg: "
"as the `data_dir` arg: "
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
"`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`"
)
)
...
...
lm_eval/decontamination/janitor.py
View file @
d95a4333
...
@@ -158,7 +158,7 @@ class Janitor:
...
@@ -158,7 +158,7 @@ class Janitor:
def
clean
(
self
,
dirty_string
):
def
clean
(
self
,
dirty_string
):
"""Clean a string (e.g. a training set) by removing all ngrams previously
"""Clean a string (e.g. a training set) by removing all ngrams previously
re
i
gstered as contaminants. Returns a list of clean chunks, or empty if
reg
i
stered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
the string was too dirty"""
if
JANITOR_CPP
:
if
JANITOR_CPP
:
return
self
.
clean_cpp
(
dirty_string
)
return
self
.
clean_cpp
(
dirty_string
)
...
@@ -275,7 +275,7 @@ class Janitor:
...
@@ -275,7 +275,7 @@ class Janitor:
# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
# ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
# oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
# paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
# would last, took a cautious approach, prefering to save the revenue rather than investing it in
# would last, took a cautious approach, prefer
r
ing to save the revenue rather than investing it in
# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
# development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
# to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
# brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
...
...
lm_eval/models/gpt2.py
View file @
d95a4333
...
@@ -25,7 +25,7 @@ class HFLM(BaseLM):
...
@@ -25,7 +25,7 @@ class HFLM(BaseLM):
self
.
_device
=
torch
.
device
(
device
)
self
.
_device
=
torch
.
device
(
device
)
print
(
f
"Using device '
{
device
}
'"
)
print
(
f
"Using device '
{
device
}
'"
)
else
:
else
:
print
(
"Device not specifi
c
ed"
)
print
(
"Device not specified"
)
print
(
f
"Cuda Available?
{
torch
.
cuda
.
is_available
()
}
"
)
print
(
f
"Cuda Available?
{
torch
.
cuda
.
is_available
()
}
"
)
self
.
_device
=
(
self
.
_device
=
(
torch
.
device
(
"cuda"
)
torch
.
device
(
"cuda"
)
...
...
lm_eval/models/gpt3.py
View file @
d95a4333
...
@@ -124,10 +124,10 @@ class GPT3LM(BaseLM):
...
@@ -124,10 +124,10 @@ class GPT3LM(BaseLM):
toks
=
x
[
1
]
+
x
[
2
]
toks
=
x
[
1
]
+
x
[
2
]
return
-
len
(
toks
),
tuple
(
toks
)
return
-
len
(
toks
),
tuple
(
toks
)
reord
=
utils
.
Reorderer
(
requests
,
_collate
)
re
_
ord
=
utils
.
Reorderer
(
requests
,
_collate
)
for
chunk
in
tqdm
(
for
chunk
in
tqdm
(
list
(
utils
.
chunks
(
reord
.
get_reordered
(),
self
.
REQ_CHUNK_SIZE
)),
list
(
utils
.
chunks
(
re
_
ord
.
get_reordered
(),
self
.
REQ_CHUNK_SIZE
)),
disable
=
disable_tqdm
,
disable
=
disable_tqdm
,
):
):
inps
=
[]
inps
=
[]
...
@@ -163,7 +163,7 @@ class GPT3LM(BaseLM):
...
@@ -163,7 +163,7 @@ class GPT3LM(BaseLM):
if
cache_key
is
not
None
:
if
cache_key
is
not
None
:
self
.
cache_hook
.
add_partial
(
"loglikelihood"
,
cache_key
,
answer
)
self
.
cache_hook
.
add_partial
(
"loglikelihood"
,
cache_key
,
answer
)
return
reord
.
get_original
(
res
)
return
re
_
ord
.
get_original
(
res
)
def
greedy_until
(
self
,
requests
):
def
greedy_until
(
self
,
requests
):
if
not
requests
:
if
not
requests
:
...
@@ -174,7 +174,7 @@ class GPT3LM(BaseLM):
...
@@ -174,7 +174,7 @@ class GPT3LM(BaseLM):
toks
=
self
.
tok_encode
(
x
[
0
])
toks
=
self
.
tok_encode
(
x
[
0
])
return
len
(
toks
),
x
[
0
]
return
len
(
toks
),
x
[
0
]
reord
=
utils
.
Reorderer
(
requests
,
_collate
)
re
_
ord
=
utils
.
Reorderer
(
requests
,
_collate
)
def
sameuntil_chunks
(
xs
,
size
):
def
sameuntil_chunks
(
xs
,
size
):
ret
=
[]
ret
=
[]
...
@@ -191,7 +191,7 @@ class GPT3LM(BaseLM):
...
@@ -191,7 +191,7 @@ class GPT3LM(BaseLM):
# todo: more intelligent batching for heterogeneous `until`
# todo: more intelligent batching for heterogeneous `until`
for
chunk
,
until
in
tqdm
(
for
chunk
,
until
in
tqdm
(
list
(
sameuntil_chunks
(
reord
.
get_reordered
(),
self
.
REQ_CHUNK_SIZE
))
list
(
sameuntil_chunks
(
re
_
ord
.
get_reordered
(),
self
.
REQ_CHUNK_SIZE
))
):
):
inps
=
[]
inps
=
[]
for
context
,
_
in
chunk
:
for
context
,
_
in
chunk
:
...
@@ -219,7 +219,7 @@ class GPT3LM(BaseLM):
...
@@ -219,7 +219,7 @@ class GPT3LM(BaseLM):
res
.
append
(
s
)
res
.
append
(
s
)
return
reord
.
get_original
(
res
)
return
re
_
ord
.
get_original
(
res
)
def
_model_call
(
self
,
inps
):
def
_model_call
(
self
,
inps
):
# Isn't used because we override _loglikelihood_tokens
# Isn't used because we override _loglikelihood_tokens
...
...
lm_eval/tasks/drop.py
View file @
d95a4333
...
@@ -74,16 +74,16 @@ class DROP(Task):
...
@@ -74,16 +74,16 @@ class DROP(Task):
{"number": ['1', '8'], ...}
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
"""
vas
=
[]
va
lid_answer
s
=
[]
for
i
in
range
(
len
(
validated_answers
[
"number"
])):
for
i
in
range
(
len
(
validated_answers
[
"number"
])):
vas
.
append
(
va
lid_answer
s
.
append
(
{
{
"number"
:
validated_answers
[
"number"
][
i
],
"number"
:
validated_answers
[
"number"
][
i
],
"date"
:
validated_answers
[
"date"
][
i
],
"date"
:
validated_answers
[
"date"
][
i
],
"spans"
:
validated_answers
[
"spans"
][
i
],
"spans"
:
validated_answers
[
"spans"
][
i
],
}
}
)
)
return
vas
return
va
lid_answer
s
answers
=
[]
answers
=
[]
answers_set
=
set
()
answers_set
=
set
()
...
...
lm_eval/tasks/hendrycks_ethics.py
View file @
d95a4333
...
@@ -10,7 +10,7 @@ to steer chatbot outputs or eventually regularize open-ended reinforcement
...
@@ -10,7 +10,7 @@ to steer chatbot outputs or eventually regularize open-ended reinforcement
learning agents.
learning agents.
NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
NOTE: The reported "group" accuracies for the Deontology, Justice, and Virtue
tasks are refered to in this work as the `em` sub-metric. See Section 3. Metrics.
tasks are refer
r
ed to in this work as the `em` sub-metric. See Section 3. Metrics.
of the paper.
of the paper.
Homepage: https://github.com/hendrycks/ethics
Homepage: https://github.com/hendrycks/ethics
...
@@ -323,7 +323,7 @@ class EthicsUtilitarianism(Ethics):
...
@@ -323,7 +323,7 @@ class EthicsUtilitarianism(Ethics):
}
}
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"Scenario 1: {}
\n
Scenario 2: {}
\n
Question: Is Scenario 1 prefer
r
able?
\n
Answer:"
.
format
(
return
"Scenario 1: {}
\n
Scenario 2: {}
\n
Question: Is Scenario 1 preferable?
\n
Answer:"
.
format
(
doc
[
"scenarios"
][
0
],
doc
[
"scenarios"
][
1
]
doc
[
"scenarios"
][
0
],
doc
[
"scenarios"
][
1
]
)
)
...
...
scripts/clean_training_data/README.md
View file @
d95a4333
...
@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
...
@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
1) Collects all contamination text files that are to be removed from training data
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding
`N`
gram matches between the training data
2) Filters training data by finding
`N`
gram matches between the training data
and any contamination
and any contamination
1)
`N`
grams ignore case and punctation and are split on whitespace.
1)
`N`
grams ignore case and punct
u
ation and are split on whitespace.
2) Matching
`N`
gram substrings are removed, as is a
`window_to_remove`
character window around
2) Matching
`N`
gram substrings are removed, as is a
`window_to_remove`
character window around
the match, splitting the training data into chunks
the match, splitting the training data into chunks
3) Any chunks less than
`minimum_slice_length`
are removed
3) Any chunks less than
`minimum_slice_length`
are removed
...
@@ -20,7 +20,7 @@ minimum_slice_length = 200
...
@@ -20,7 +20,7 @@ minimum_slice_length = 200
too_dirty_cutoff = 10
too_dirty_cutoff = 10
```
```
## Compling
## Comp
i
ling
Janitor can be used as a pure python program, but it is much faster if the ngram
Janitor can be used as a pure python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run
code is run in C++. To compile the C++ code, run
...
...
scripts/clean_training_data/process_sorted_buckets.py
View file @
d95a4333
...
@@ -63,7 +63,7 @@ def process_bucket(
...
@@ -63,7 +63,7 @@ def process_bucket(
for
line
in
bucket
.
read
():
for
line
in
bucket
.
read
():
[
ngram
,
document_id
]
=
line
.
rsplit
(
" "
,
1
)
[
ngram
,
document_id
]
=
line
.
rsplit
(
" "
,
1
)
# Write ngram if more then 10 unique document occurences
# Write ngram if more than 10 unique document occur
r
ences
if
ngram
!=
current_ngram
:
if
ngram
!=
current_ngram
:
if
len
(
current_ngram_document_ids
)
>
10
:
if
len
(
current_ngram_document_ids
)
>
10
:
output_archive
.
add_data
(
output_archive
.
add_data
(
...
...
templates/new_multiple_choice_task.py
View file @
d95a4333
# TODO: Remove all TODO comments once the implementation is complete.
# TODO: Remove all TODO comments once the implementation is complete.
"""
"""
TODO: Add the Paper Title on this line.
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (prefer
r
ably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
TODO: Write a Short Description of the task.
...
...
templates/new_task.py
View file @
d95a4333
# TODO: Remove all TODO comments once the implementation is complete.
# TODO: Remove all TODO comments once the implementation is complete.
"""
"""
TODO: Add the Paper Title on this line.
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (prefer
r
ably from arXiv) on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
TODO: Write a Short Description of the task.
...
@@ -45,7 +45,7 @@ class NewTask(Task):
...
@@ -45,7 +45,7 @@ class NewTask(Task):
if
self
.
_training_docs
is
None
:
if
self
.
_training_docs
is
None
:
# TODO: Return the training document generator from `self.dataset`.
# TODO: Return the training document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with
# If you need to process the data, `map` over the documents with
# the custom procesing function, `self._process_doc`. E.g.
# the custom proces
s
ing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
# `map(self._process_doc, self.dataset["validation"])`
# In most cases you can leave this as is unless the dataset split is
# In most cases you can leave this as is unless the dataset split is
# named differently than the default `"train"`.
# named differently than the default `"train"`.
...
@@ -56,7 +56,7 @@ class NewTask(Task):
...
@@ -56,7 +56,7 @@ class NewTask(Task):
if
self
.
has_validation_docs
():
if
self
.
has_validation_docs
():
# TODO: Return the validation document generator from `self.dataset`.
# TODO: Return the validation document generator from `self.dataset`.
# If you need to process the data, `map` over the documents with the
# If you need to process the data, `map` over the documents with the
# custom procesing function, `self._process_doc`. E.g.
# custom proces
s
ing function, `self._process_doc`. E.g.
# `map(self._process_doc, self.dataset["validation"])`
# `map(self._process_doc, self.dataset["validation"])`
# In most cases you can leave this as is unless the dataset split is
# In most cases you can leave this as is unless the dataset split is
# named differently than the default `"validation"`.
# named differently than the default `"validation"`.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment