gaoqiong / lm-evaluation-harness · Commits

Commit cf80f340, authored Sep 07, 2020 by Jason Phang

glue tasks

Parent: c2aaa501
Showing 7 changed files with 406 additions and 50 deletions (+406, −50):
base.py             +8   −6
hf.py               +1   −1
main.py             +39  −3
tasks/__init__.py   +3   −0
tasks/common.py     +7   −0
tasks/glue.py       +312 −40
tasks/superglue.py  +36  −0
base.py
...
@@ -103,14 +103,16 @@ class Dataset(abc.ABC):
         """
         pass
 
-    def fewshot_prefix(self):
+    def fewshot_description(self):
         return ""
 
-    def fewshot_context(self, doc, k):
-        prefix = self.fewshot_prefix()
-        labeled_examples = "\n\n".join([self.doc_to_text(doc) for doc in self.fewshot_examples(k)])
-        example = self.doc_to_text(doc, include_target=False)
-        return prefix + labeled_examples + example
+    def fewshot_context(self, doc, num_fewshot, provide_description):
+        description = (self.fewshot_description() + "\n\n") if provide_description else ""
+        labeled_examples = "\n\n".join(
+            map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
+        ) + "\n\n"
+        example = self.doc_to_text(doc, include_target=False).strip()
+        return description + labeled_examples + example
 
 class Registry:
...
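For orientation, the string the new fewshot_context assembles is: an optional task description, then num_fewshot labeled examples joined by blank lines, then the query document with its target omitted. A minimal sketch with invented task text; only the joining logic mirrors the method above:

    # Hypothetical two-shot prompt, laid out the way
    # fewshot_context(doc, num_fewshot=2, provide_description=True) builds it.
    description = "Answer each question with a yes or a no."
    labeled_examples = "\n\n".join([
        "question: Is the sky blue?\nanswer: yes",   # few-shot example, target included
        "question: Is fire cold?\nanswer: no",       # few-shot example, target included
    ])
    example = "question: Is grass green?\nanswer:"   # evaluation doc, target omitted
    prompt = (description + "\n\n") + labeled_examples + "\n\n" + example
    print(prompt)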
hf.py
...
@@ -23,7 +23,7 @@ class BoolQ(base.Dataset):
     def test_docs(self):
         return []
 
-    def fewshot_prefix(self):
+    def fewshot_description(self):
         return "Read the following passages and answer each question with a yes or a no."
 
     def doc_to_text(self, doc, include_target=True):
...
main.py
-from models.gpt2 import GPT2LM
+import argparse
+import json
 
-lm = GPT2LM()
+import models
+import tasks
 
-print(lm.generate('1 + 1 = 2.\n3 + 5 = 8.\n4 + 9 = 13.\n4 + 3 = 7.\n2 + 3 =', '.'))
\ No newline at end of file
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', required=True)
+    parser.add_argument('--model_args', default="")
+    parser.add_argument('--tasks', default="all_tasks")
+    parser.add_argument('--provide_description', action="store_true")
+    parser.add_argument('--num_fewshot', type=int, default=0)
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    model = models.get_model(args.model).create_from_arg_string(args.model_args)
+    if args.tasks == "all_tasks":
+        task_names = tasks.ALL_TASKS
+    else:
+        task_names = args.tasks.split(",")
+    task_list = {task_name: tasks.get_task(task_name)() for task_name in task_names}
+    results = {}
+    for task_name, task in task_list.items():
+        if not task.has_validation_docs():
+            continue
+        result = task.evaluate(
+            docs=task.validation_docs(),
+            lm=model,
+            provide_description=args.provide_description,
+            num_fewshot=args.num_fewshot,
+        )
+        results[task_name] = result
+    print(json.dumps(results, indent=2))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
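Assuming a GPT-2 model is registered under the name gpt2 in models (the name is illustrative; anything models.get_model resolves will do), the rewritten entry point is invoked along these lines and prints per-task results as indented JSON:

    python main.py --model gpt2 --tasks cola,sst --provide_description --num_fewshot 2

Omitting --tasks runs every registered task in tasks.ALL_TASKS that has validation docs.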
tasks/__init__.py
...
@@ -16,5 +16,8 @@ for file in os.listdir(tasks_dir):
         module = importlib.import_module('lm_evaluation_harness.tasks.' + module_name)
 
 ALL_TASKS = sorted(list(TASK_REGISTRY.registry))
+
+def get_task(model_name):
+    return TASK_REGISTRY.registry[model_name]
tasks/common.py
...
@@ -31,3 +31,10 @@ def simple_accuracy_metric(preds, golds):
         "minor": {"acc": acc},
         "higher_is_better": True,
     }
+
+
+def yesno(x):
+    if x:
+        return 'yes'
+    else:
+        return 'no'
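The metric helpers in this commit share one return convention: a single headline number under "major" (presumably the accuracy here, matching get_accuracy_and_f1 in tasks/glue.py below), the full breakdown under "minor", and a "higher_is_better" flag for downstream comparison. A sketch of the resulting shape, with an illustrative value:

    # Shape of the dict simple_accuracy_metric returns (0.83 is made up):
    {
        "major": 0.83,
        "minor": {"acc": 0.83},
        "higher_is_better": True,
    }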
tasks/glue.py
 import nlp
 import numpy as np
 import random
 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import f1_score, matthews_corrcoef
-from .common import NLP_TASK, simple_accuracy_metric
+from tqdm import auto as tqdm_lib
+
+from .common import NLP_TASK, simple_accuracy_metric, yesno
 from . import TASK_REGISTRY
+
+
+def get_accuracy_and_f1(preds, golds):
+    golds = np.array(golds)
+    preds = np.array(preds)
+    acc = float((preds == golds).mean())
+    f1 = float(f1_score(y_true=golds, y_pred=preds))
+    minor = {
+        "acc": acc,
+        "f1": f1,
+        "acc_and_f1": (acc + f1) / 2,
+    }
+    return {
+        "major": minor["acc_and_f1"],
+        "minor": minor,
+        "higher_is_better": True,
+    }
 
 
 @TASK_REGISTRY.register("cola")
 class CoLA(NLP_TASK):
     NLP_PATH = "glue"
     NLP_NAME = "cola"
 
     def has_training_docs(self):
         return True
...
@@ -17,27 +37,25 @@ class CoLA(NLP_TASK):
     def has_test_docs(self):
         return True
 
+    def fewshot_description(self):
+        return "Does this sentence make sense?:\tTrue or False?"
+
     def doc_to_text(self, doc, include_target=True):
-        text = "Does this sentence make sense?:\tTrue or False?" \
-               "\nsentence:{}\nAnswer: ".format(doc["sentence"])
+        text = "\nSentence:{}\nAnswer: ".format(doc["sentence"])
         if include_target:
             text += " {}".format({1: "True", 0: "False"}[doc["label"]])
         return text
 
-    def evaluate(self, docs, lm, k=0):
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
         golds = [doc["label"] for doc in docs]
         preds = []
-        for doc in docs:
-            word = lm.generate(
-                context=self.fewshot_context(doc=doc, k=k),
-                max_gen_length=1,
-            )
-            if word.strip() == "True":
-                preds.append(1)
-            elif word.strip() == "False":
-                preds.append(0)
-            else:
-                preds.append(-1)
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
         golds = np.array(golds)
         preds = np.array(preds)
         mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
...
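Note that the loglikelihood comparison appends raw booleans to preds; this works because NumPy coerces True/False to 1/0 when comparing against the integer labels, so accuracy and matthews_corrcoef see a binary prediction. A quick self-contained check:

    import numpy as np

    golds = np.array([1, 0, 1])
    preds = np.array([True, False, False])  # booleans from loglikelihood comparisons
    print(float((preds == golds).mean()))   # 0.666..., since bools compare as 1/0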
@@ -50,6 +68,9 @@ class CoLA(NLP_TASK):
 
 @TASK_REGISTRY.register("mnli")
 class MNLI(NLP_TASK):
     NLP_PATH = "glue"
     NLP_NAME = "mnli"
 
     def has_training_docs(self):
         return True
...
@@ -69,8 +90,8 @@ class MNLI(NLP_TASK):
     def doc_to_text(self, doc, include_target=True):
         text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
-            doc["sentence1"],
-            doc["sentence2"],
+            doc["premise"],
+            doc["hypothesis"],
         )
         if include_target:
             # True = entailment
...
@@ -79,26 +100,65 @@ class MNLI(NLP_TASK):
             text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
         return text
 
-    def evaluate(self, docs, lm, k=0):
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
         golds = [doc["label"] for doc in docs]
         preds = []
-        for doc in docs:
-            word = lm.generate(
-                context=self.fewshot_context(doc=doc, k=k),
-                max_gen_length=1,
-            )
-            if word.strip() == "True":
-                preds.append(1)
-            elif word.strip() == "False":
-                preds.append(0)
-            else:
-                preds.append(-1)
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            probs = np.array([
+                lm.loglikelihood(ctx, ' True'),
+                lm.loglikelihood(ctx, ' Neither'),
+                lm.loglikelihood(ctx, ' False'),
+            ])
+            preds.append(np.argmax(probs))
         return simple_accuracy_metric(preds=preds, golds=golds)
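The MNLI evaluate above generalizes the binary comparison to any fixed label set: score each candidate continuation against the same context and take the argmax. A minimal standalone sketch, assuming only that lm exposes the loglikelihood(context, continuation) method used throughout this diff (the helper name is invented):

    import numpy as np

    def classify(lm, ctx, candidates):
        # Index of the continuation the model scores highest.
        scores = [lm.loglikelihood(ctx, cand) for cand in candidates]
        return int(np.argmax(scores))

    # classify(lm, ctx, [' True', ' Neither', ' False']) returns 0, 1, or 2,
    # which lines up with MNLI's label encoding in doc_to_text above.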
+
+@TASK_REGISTRY.register("mrpc")
+class MRPC(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "mrpc"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if both sentences mean the same thing."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
+            doc["sentence1"],
+            doc["sentence2"],
+        )
+        if include_target:
+            text += " {}".format(yesno(doc["label"]))
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
+        return get_accuracy_and_f1(preds=preds, golds=golds)
 
 @TASK_REGISTRY.register("rte")
 class RTE(NLP_TASK):
     NLP_PATH = "glue"
     NLP_NAME = "rte"
...
@@ -120,18 +180,230 @@ class RTE(NLP_TASK):
             text += " {}".format({1: "True", 0: "False"}[doc["label"]])
         return text
 
-    def evaluate(self, docs, lm, k=0):
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
         golds = [doc["label"] for doc in docs]
         preds = []
-        for doc in docs:
-            word = lm.generate(
-                context=self.fewshot_context(doc=doc, k=k),
-                max_gen_length=1,
-            )
-            if word.strip() == "True":
-                preds.append(1)
-            elif word.strip() == "False":
-                preds.append(0)
-            else:
-                preds.append(-1)
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
         return simple_accuracy_metric(preds=preds, golds=golds)
+
+@TASK_REGISTRY.register("qnli")
+class QNLI(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "qnli"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
+            doc["question"],
+            doc["sentence"],
+        )
+        if include_target:
+            # True = entailment
+            # False = not entailment
+            text += " {}".format({0: "True", 1: "False"}[doc["label"]])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
+        return simple_accuracy_metric(preds=preds, golds=golds)
+
+@TASK_REGISTRY.register("qqp")
+class QQP(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "qqp"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if both sentences mean the same thing."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
+            doc["question1"],
+            doc["question2"],
+        )
+        if include_target:
+            text += " {}".format(yesno(doc["label"]))
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
+        return get_accuracy_and_f1(preds=preds, golds=golds)
+
+@TASK_REGISTRY.register("stsb")
+class STSB(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "stsb"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
+               "where 5 means identical and 0 means unrelated."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
+            doc["sentence1"],
+            doc["sentence2"],
+        )
+        if include_target:
+            text += " {}".format(doc["label"])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            output = lm.generate(context=ctx, max_gen_length=5).strip()
+            first_element = output.split()[0]
+            if first_element.isnumeric():
+                pred = max(min(float(first_element), 5.0), 0.0)
+            else:
+                pred = 2.5
+            preds.append(pred)
+        pearson_corr = float(pearsonr(preds, golds)[0])
+        spearman_corr = float(spearmanr(preds, golds)[0])
+        minor = {
+            "pearson": pearson_corr,
+            "spearmanr": spearman_corr,
+            "corr": (pearson_corr + spearman_corr) / 2,
+        }
+        return {
+            "major": minor["corr"],
+            "minor": minor,
+            "higher_is_better": True,
+        }
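STS-B is the one regression task here, so it falls back to free-form generation: the first token of the output is parsed as a score, clamped to the 0-5 scale, with 2.5 (the scale midpoint) as the fallback for non-numeric output. A standalone sketch of that parsing rule (the function name is invented):

    def parse_similarity(output: str) -> float:
        tokens = output.strip().split()
        first = tokens[0] if tokens else ""
        # str.isnumeric() rejects decimals such as "3.5", so those
        # also hit the 2.5 fallback, mirroring evaluate above.
        if first.isnumeric():
            return max(min(float(first), 5.0), 0.0)
        return 2.5

    assert parse_similarity("3 out of 5") == 3.0
    assert parse_similarity("7") == 5.0        # clamped to the top of the scale
    assert parse_similarity("similar") == 2.5  # non-numeric fallback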
+
+@TASK_REGISTRY.register("sst")
+class SST(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "sst2"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Indicate if each sentence is Positive or Negative."
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "sentence:\t{}\t\nanswer:".format(doc["sentence"])
+        if include_target:
+            text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
+        return simple_accuracy_metric(preds=preds, golds=golds)
+
+@TASK_REGISTRY.register("wnli")
+class WNLI(NLP_TASK):
+    NLP_PATH = "glue"
+    NLP_NAME = "wnli"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def doc_to_text(self, doc, include_target=True):
+        text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
+            doc["premise"],
+            doc["hypothesis"],
+        )
+        if include_target:
+            # True = entailment
+            # False = contradiction
+            # Neither = neutral
+            text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
+        return text
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["label"] for doc in docs]
+        preds = []
+        for doc in tqdm_lib.tqdm(docs):
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            probs = np.array([
+                lm.loglikelihood(ctx, ' True'),
+                lm.loglikelihood(ctx, ' Neither'),
+                lm.loglikelihood(ctx, ' False'),
+            ])
+            preds.append(np.argmax(probs))
+        return simple_accuracy_metric(preds=preds, golds=golds)
tasks/superglue.py
(new file, mode 100644)
+from .common import NLP_TASK, simple_accuracy_metric, yesno
+from . import TASK_REGISTRY
+
+
+@TASK_REGISTRY.register("boolq")
+class BoolQ(NLP_TASK):
+    NLP_PATH = "superglue"
+    NLP_NAME = "boolq"
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def fewshot_description(self):
+        return "Read the following passages and answer each question with a yes or a no."
+
+    def doc_to_text(self, doc, include_target=True):
+        return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \
+            + (yesno(doc['answer']) if include_target else "")
+
+    def evaluate(self, docs, lm, provide_description, num_fewshot):
+        golds = [doc["answer"] for doc in docs]
+        preds = []
+        for doc in docs:
+            ctx = self.fewshot_context(
+                doc=doc,
+                provide_description=provide_description,
+                num_fewshot=num_fewshot,
+            )
+            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
+        return simple_accuracy_metric(preds=preds, golds=golds)