gaoqiong / lm-evaluation-harness · Commits

Commit d986fd3c
Authored Feb 28, 2021 by Jon Tow
Parents: 805b541c, 879aabd6

    Add proper normalization, checksum, and formatting

Showing 6 changed files with 270 additions and 42 deletions (+270 -42)
    lm_eval/models/dummy.py                      +2    -1
    lm_eval/tasks/__init__.py                    +9    -1
    lm_eval/tasks/drop.py                        +75   -34
    lm_eval/tasks/translation.py                 +0    -6
    lm_eval/tasks/unscramble.py                  +101  -0
    scripts/fewshot_description_experiment.py    +83   -0
lm_eval/models/dummy.py

@@ -21,7 +21,8 @@ class DummyLM(LM):
     def greedy_until(self, requests):
         res = []

-        for _ in requests:
+        for ctx, _ in requests:
             res.append("lol")
+            assert ctx.strip() != ''

         return res
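A note on the change above (not part of the commit): each greedy_until request in lm_eval is a (context, stop_sequences) pair, which is why the loop now unpacks ctx and the new assertion rejects empty contexts. A minimal sketch of the dummy model after this change, assuming DummyLM can be constructed with no arguments:

    from lm_eval.models.dummy import DummyLM

    lm = DummyLM()
    # One request: a prompt plus the stop sequences to generate until.
    print(lm.greedy_until([("Passage: ...\nAnswer:", ["\n"])]))  # -> ["lol"]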
lm_eval/tasks/__init__.py

@@ -19,7 +19,7 @@ from . import naturalqs
 from . import sat
 from . import arithmetic
 from . import lambada
 from . import race
 from . import piqa
 from . import triviaqa
 from . import pubmedqa
@@ -30,6 +30,7 @@ from . import translation
 from . import headqa
 from . import mathqa
 from . import drop
+from . import unscramble

 ########################################
 # Translation tasks
@@ -132,6 +133,13 @@ TASK_REGISTRY = {
     **translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks),
     # chef's selection, mostly wmt20
     **translation.create_tasks_from_benchmarks(selected_translation_benchmarks),
+
+    # Word Scrambling and Manipulation Tasks
+    "anagrams1": unscramble.Anagrams1,
+    "anagrams2": unscramble.Anagrams2,
+    "cycle_letters": unscramble.CycleLetters,
+    "random_insertion": unscramble.RandomInsertion,
+    "reversed_words": unscramble.ReversedWords,
 }
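For context (not part of the commit): once these entries are in TASK_REGISTRY, the unscramble tasks can be looked up like any other task. A minimal sketch using the task names registered above and the get_task_dict helper that the experiment script in this commit also uses (constructing a task may trigger its download() step):

    from lm_eval import tasks

    task_dict = tasks.get_task_dict(["anagrams1", "cycle_letters", "reversed_words"])
    for name, task in task_dict.items():
        print(name, task.has_validation_docs())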
lm_eval/tasks/drop.py

 import json
 import numpy as np
 import re
-import transformers.data.metrics.squad_metrics as squad_metrics
+import string

 from best_download import download_file
 from scipy.optimize import linear_sum_assignment
 from lm_eval.base import Task, rf
@@ -16,15 +16,18 @@ https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_r
 class DROP(Task):
-    DATAFOLDER = Path("data/drop")
-    URL = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
+    DATASET_PATH = Path("data/drop")

     def download(self):
-        if self.DATAFOLDER.exists(): return
-        Path.mkdir(self.DATAFOLDER)
-        download_file(self.URL, to=str(self.DATAFOLDER / "drop_dataset.zip"))
-        with ZipFile(self.DATAFOLDER / "drop_dataset.zip", "r") as zip:
-            zip.extractall(self.DATAFOLDER)
+        if self.DATASET_PATH.exists():
+            return
+        Path.mkdir(self.DATASET_PATH)
+        url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
+        checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
+        zip_path = self.DATASET_PATH / "drop_dataset.zip"
+        download_file(url, str(zip_path), checksum)
+        with ZipFile(zip_path, "r") as zip:
+            zip.extractall(self.DATASET_PATH)

     def has_training_docs(self):
         return True
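Side note (not part of the commit): the checksum strings added here and in unscramble.py are 64-character hex digests, which suggests best_download verifies a SHA256 hash of the downloaded file. A sketch of how such a digest could be produced for a new dataset, using only the standard library (the helper name is made up for illustration):

    import hashlib

    def sha256_hexdigest(path, chunk_size=1 << 20):
        # Stream the file so large archives do not need to fit in memory.
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()

    print(sha256_hexdigest("data/drop/drop_dataset.zip"))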
@@ -61,16 +64,13 @@ class DROP(Task):
                              answers["date"]["year"]]).strip()]

     def training_docs(self):
-        docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_train.json"))
+        docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_train.json"))
         return self._load_docs([docs[k] for k in docs.keys()])

     def validation_docs(self):
-        docs = json.load(open(self.DATAFOLDER / "drop_dataset" / "drop_dataset_dev.json"))
+        docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_dev.json"))
         return self._load_docs([docs[k] for k in docs.keys()])

-    def test_docs(self):
-        pass
-
     def doc_to_text(self, doc):
         return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
@@ -103,44 +103,55 @@ class DROP(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
-        golds, preds = doc["answers"], results
-        exact_match = self._exact_match(golds, preds)
-        f1_score = self._f1_score(golds, preds)
+        preds, golds = results, doc["answers"]
+        exact_match, f1_score = self.get_metrics(preds, golds)
         return {
             "em": exact_match,
             "f1": f1_score
         }

-    def _exact_match(self, golds, preds):
-        """ Returns the exact match of normalized gold answers and predictions. """
-        normalized_golds = set([self._normalize(gold) for gold in golds])
-        normalized_preds = set([self._normalize(pred) for pred in preds])
-        return int(normalized_golds == normalized_preds)
+    def get_metrics(self, preds, golds):
+        exact_match = self._exact_match(preds, golds)
+        f1_score = self._f1_score(preds, golds)
+        return exact_match, f1_score

-    def _f1_score(self, golds, preds):
-        """Returns the average F1-score over normalized gold answers and predictions. """
-        gold_bags = self._answer_to_bags(golds)
+    def _exact_match(self, preds, golds):
+        """ Returns the exact match of normalized gold answers and predictions. """
+        normalized_preds = [self._normalize(pred) for pred in preds]
+        normalized_golds = [self._normalize(gold) for gold in golds]
+        is_equal_sets = set(normalized_preds) == set(normalized_golds)
+        is_equal_length = len(normalized_preds) == len(normalized_golds)
+        return int(is_equal_sets and is_equal_length)
+
+    def _f1_score(self, preds, golds):
+        """Returns the average F1-score over normalized gold answers and predictions.
+        From Section 5 of Dua et al. "DROP:...":
+            "When an answer has multiple spans, we first perform a one-to-one
+            alignment greedily based on bag-of-word overlap on the set of spans
+            and then compute average F1 over each span."
+        """
         pred_bags = self._answer_to_bags(preds)
-        f1_per_bag = self._align_bags(gold_bags, pred_bags)
+        gold_bags = self._answer_to_bags(golds)
+        f1_per_bag = self._align_bags(pred_bags, gold_bags)
         return np.mean(f1_per_bag)

     def _answer_to_bags(self, answers):
         return [set(self._normalize(answer).split()) for answer in answers]

-    def _align_bags(self, gold_bags, pred_bags):
+    def _align_bags(self, pred_bags, gold_bags):
         """ Returns the max metric value over all the answers. """
         scores = np.zeros([len(gold_bags), len(pred_bags)])
         for gold_index, gold_bag in enumerate(gold_bags):
             for pred_index, pred_bag in enumerate(pred_bags):
-                if self._is_number_match(gold_bag, pred_bag):
-                    scores[gold_index, pred_index] = self._bag_f1(gold_bag, pred_bag)
+                if self._is_number_match(pred_bag, gold_bag):
+                    scores[gold_index, pred_index] = self._bag_f1(pred_bag, gold_bag)
         row_ind, col_ind = linear_sum_assignment(-scores)
         max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
         for row, column in zip(row_ind, col_ind):
             max_scores[row] = max(max_scores[row], scores[row, column])
         return max_scores

-    def _bag_f1(self, gold_bag, pred_bag):
+    def _bag_f1(self, pred_bag, gold_bag):
         intersection = len(gold_bag.intersection(pred_bag))
         if intersection == 0:
             return 0.0
@@ -149,15 +160,45 @@ class DROP(Task):
         f1 = (2 * precision * recall) / (precision + recall)
         return f1

-    def _is_number_match(self, gold_bag, pred_bag):
-        gold_numbers = set(filter(lambda s: s.isnumeric(), list(gold_bag)))
-        pred_numbers = set(filter(lambda s: s.isnumeric(), list(pred_bag)))
-        return (not gold_numbers) or gold_numbers.intersection(pred_numbers)
+    def _is_number_match(self, pred_bag, gold_bag):
+        pred_numbers = set([word for word in pred_bag if self._is_number(word)])
+        gold_numbers = set([word for word in gold_bag if self._is_number(word)])
+        if (not gold_numbers) or gold_numbers.intersection(pred_numbers):
+            return True
+        return False
+
+    def _is_number(self, text):
+        try:
+            float(text)
+            return True
+        except ValueError:
+            return False

     def _normalize(self, answer):
+        def remove_articles(text):
+            regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+            return re.sub(regex, " ", text)
+
+        def white_space_fix(text):
+            return " ".join(text.split())
+
+        def remove_punc(text):
+            exclude = set(string.punctuation)
+            if not self._is_number(text):
+                return "".join(ch for ch in text if ch not in exclude)
+            else:
+                return text
+
+        def fix_number(text):
+            return str(float(text)) if self._is_number(text) else text
+
         def tokenize(text):
             return re.split(" |-", text)
-        tokens = [squad_metrics.normalize_answer(token) for token in tokenize(answer)]
+
+        tokens = [
+            white_space_fix(remove_articles(fix_number(remove_punc(token.lower()))))
+            for token in tokenize(answer)
+        ]
         tokens = [token for token in tokens if token.strip()]
         normalized = " ".join(tokens).strip()
         return normalized
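To make the new scoring path above easier to follow, here is a standalone sketch (not part of the commit) of the span-alignment idea in _align_bags / _bag_f1: scipy's linear_sum_assignment on the negated score matrix picks the one-to-one pairing of predicted and gold spans that maximises total F1, matching the DROP paper's multi-span rule. The number-match gate and answer normalization are omitted for brevity:

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def bag_f1(pred_bag, gold_bag):
        intersection = len(gold_bag & pred_bag)
        if intersection == 0:
            return 0.0
        precision = intersection / len(pred_bag)
        recall = intersection / len(gold_bag)
        return (2 * precision * recall) / (precision + recall)

    def align_bags(pred_bags, gold_bags):
        scores = np.zeros([len(gold_bags), len(pred_bags)])
        for g, gold_bag in enumerate(gold_bags):
            for p, pred_bag in enumerate(pred_bags):
                scores[g, p] = bag_f1(pred_bag, gold_bag)
        row_ind, col_ind = linear_sum_assignment(-scores)
        max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
        for row, column in zip(row_ind, col_ind):
            max_scores[row] = max(max_scores[row], scores[row, column])
        return max_scores

    # Out-of-order multi-span answers still score perfectly once aligned:
    preds = [set("john smith".split()), {"3"}]
    golds = [{"3"}, set("john smith".split())]
    print(align_bags(preds, golds).mean())  # -> 1.0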
lm_eval/tasks/translation.py

@@ -145,12 +145,6 @@ class GeneralTranslationTask(Task):
         tar_lang = code_to_language(language_codes[1])
         return f"Translate these {src_lang} phrases to {tar_lang}."

-    # TODO This should be something like
-    #   French: {src_line}
-    #   English: {ref_line}
-    def fewshot_context(self, doc, num_fewshot, provide_description):
-        return ""
-
     def __str__(self):
         language_codes = self.sacrebleu_language_pair.split("-")
         src_lang = code_to_language(language_codes[0])
lm_eval/tasks/unscramble.py  (new file, 0 → 100644)

import gzip
import json
import random
import shutil
from pathlib import Path
from best_download import download_file
from lm_eval.base import Task, rf
from lm_eval.metrics import mean


def extract_gzip(gz, to):
    with gzip.open(gz, 'rb') as fin:
        with open(to, 'wb') as fout:
            shutil.copyfileobj(fin, fout)


class WordUnscrambleTask(Task):
    BASE_PATH = Path("data/unscramble")
    FILENAME = None
    CHECKSUM = None  # SHA256 Checksum.

    def __init__(self):
        super().__init__()

    def download(self):
        if not self.BASE_PATH.exists():
            Path.mkdir(self.BASE_PATH)
        file = self.BASE_PATH / self.FILENAME
        if not file.exists():
            rawfile = file.parent / (file.name + ".gz")
            base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
            download_file(f"{base_url}/{self.FILENAME}.gz", str(rawfile), self.CHECKSUM)
            extract_gzip(gz=rawfile, to=file)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        file = self.BASE_PATH / self.FILENAME
        return (json.loads(line) for line in open(file).read().splitlines())

    def fewshot_description(self):
        return "Please unscramble the letters into a word, and write that word:"

    def doc_to_text(self, doc):
        return doc["context"]

    def doc_to_target(self, doc):
        return doc["completion"]

    def construct_requests(self, doc, ctx):
        completion = rf.greedy_until(ctx, ["\n"])
        return completion

    def process_results(self, doc, results):
        pred = results[0]
        gold = doc["completion"]
        return {"acc": int(pred == gold)}

    def aggregation(self):
        return {"acc": mean}

    def higher_is_better(self):
        return {"acc": True}


class Anagrams1(WordUnscrambleTask):
    FILENAME = "mid_word_1_anagrams.jsonl"
    CHECKSUM = "6768a86896083199de4815d4964cb2f6f1046476cfd80c2a562784f182905979"


class Anagrams2(WordUnscrambleTask):
    FILENAME = "mid_word_2_anagrams.jsonl"
    CHECKSUM = "c3d839d09a7954b78a27cd2cd75d4ed0488656c56ef4dbd741a005343826cb01"


class CycleLetters(WordUnscrambleTask):
    FILENAME = "cycle_letters_in_word.jsonl"
    CHECKSUM = "1689c9002bb8c5988bf5f05e977c9db92f57932c1b5a38998c29ac0dd71e1d42"


class RandomInsertion(WordUnscrambleTask):
    FILENAME = "random_insertion_in_word.jsonl"
    CHECKSUM = "72e65d83da53d15752ee0c47379509de149ddbad32d61184e5991df29616b78a"


class ReversedWords(WordUnscrambleTask):
    FILENAME = "reversed_words.jsonl"
    CHECKSUM = "133a08f875cd6c1ef8608a3233571a773881cc27b1c707de738cc6543439332a"
scripts/fewshot_description_experiment.py  (new file, 0 → 100644)

import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging

from lm_eval import models, tasks, evaluator, base

logging.getLogger("openai").setLevel(logging.WARNING)

fewshot_descriptions = [
    "foo",
    "bar",
]
task = "lambada"
num_fewshot = 0
model = "gpt2"
model_args = ""
limit = None
no_cache = False


class CustomDescTask:
    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

        def fewshot_description():
            return self.desc
        self.task.fewshot_description = fewshot_description

    def __getattr__(self, attr):
        return getattr(self.task, attr)


def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    if not no_cache:
        lm = base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {k: CustomDescTask(v, desc) for k, v in task_dict.items()}

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot, limit)

        dumped = json.dumps(results, indent=2)
        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []
        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values

        print(writer.dumps())


if __name__ == "__main__":
    main()
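For readers unfamiliar with the pattern used by CustomDescTask (not part of the commit): the wrapper patches fewshot_description on the wrapped task via a closure and forwards every other attribute lookup through __getattr__, so the evaluator can treat it as a drop-in Task. A stripped-down sketch of the same idea:

    class DescriptionOverride:
        """Wrap any object and override one method, delegating everything else."""
        def __init__(self, inner, description):
            self.inner = inner
            self.description = description
            # Closure capturing `self`; replaces the method on the wrapped object.
            self.inner.fewshot_description = lambda: self.description

        def __getattr__(self, attr):
            # Called only when normal attribute lookup fails on the wrapper.
            return getattr(self.inner, attr)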