gaoqiong / lm-evaluation-harness · Commits

Commit 36467c0e
authored Jan 21, 2021 by Jonathan Tow
Adopt new framework for `glue`
parent 4c8d22db
Showing 2 changed files with 264 additions and 231 deletions
lm_eval/base.py (+11, −7)
lm_eval/tasks/glue.py (+253, −224)

lm_eval/base.py (view file @ 36467c0e)

import abc
import random
import collections
import numpy as np
from sklearn.metrics import precision_recall_fscore_support as score
import sklearn


class LM(abc.ABC):
    @abc.abstractmethod
    ...

@@ -177,15 +176,23 @@ class Dataset(abc.ABC):
        return description + labeled_examples + example


def mean(arr):
    return sum(arr) / len(arr)


def median(arr):
    return arr[len(arr) // 2]


def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)


def f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    precision, recall, fscore, support = score(golds, preds)
    fscore = sklearn.metrics.f1_score(golds, preds)
    return max(fscore)


def acc_all(items):
    ...

@@ -205,9 +212,6 @@ def acc_all(items):
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc


def median(arr):
    return arr[len(arr) // 2]


req_ret_lens = {
    'loglikelihood': 2,
}

...
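
For context on how the metric helpers added to base.py are meant to be consumed: each one takes a list of per-document (gold, pred) pairs and reduces it to a single score, which is what the aggregation() methods of the converted GLUE tasks below rely on. A minimal sketch with invented predictions (the example data is not part of this commit):

import sklearn.metrics

# Invented per-document (gold, pred) pairs, shaped like the tuples that
# process_results() emits under keys such as "mcc" or "f1".
items = [(1, True), (0, False), (1, False), (0, True)]

golds, preds = zip(*items)
preds = [int(p) for p in preds]  # booleans -> 0/1 labels
print(sklearn.metrics.matthews_corrcoef(golds, preds))  # single MCC score over all documents
print(sklearn.metrics.f1_score(golds, preds))           # single F1 score over all documents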

lm_eval/tasks/glue.py (view file @ 36467c0e)

# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import numpy as np
from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from .common import HFTask, simple_accuracy_metric, yesno

def get_accuracy_and_f1(preds, golds):
    golds = np.array(golds)
    preds = np.array(preds)
    acc = float((preds == golds).mean())
    f1 = float(f1_score(y_true=golds, y_pred=preds))
    minor = {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }
    return {
        "major": minor["acc_and_f1"],
        "minor": minor,
        "higher_is_better": True,
    }

from .common import HFTask, yesno

# Single-Sentence Tasks

class CoLA(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "cola"

    def has_training_docs(self):
        return True

...

@@ -45,31 +32,80 @@ class CoLA(HFTask):
    def doc_to_target(self, doc):
        return " {}".format({1: "True", 0: "False"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_false
        # TODO: Implement evaluation code using new framework

    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_true > ll_false
        gold = doc["label"]
        return {"mcc": (gold, pred)}
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
        golds = np.array(golds)
        preds = np.array(preds)
        mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
        return {
            "major": mcc,
            "minor": {"mcc": mcc},
            "higher_is_better": True,
        }

    def higher_is_better(self):
        return {"mcc": True}

    def aggregation(self):
        return {"mcc": matthews_corrcoef}

class SST(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if each sentence is Positive or Negative."

    def doc_to_text(self, doc):
        return "sentence:\t{}\t\nanswer:".format(
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_positive, _ = rf.loglikelihood(ctx, " Positive")
        ll_negative, _ = rf.loglikelihood(ctx, " Negative")
        return ll_positive, ll_negative

    def process_results(self, doc, results):
        ll_positive, ll_negative = results
        pred = ll_positive > ll_negative
        gold = doc["label"]
        return {"acc": pred == gold}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

# Inference Tasks

class MNLI(HFTask):
    DATASET_PATH = "glue"

...

@@ -104,27 +140,28 @@ class MNLI(HFTask):
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_neither, ll_false
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            probs = np.array([
                lm.loglikelihood(ctx, ' True'),
                lm.loglikelihood(ctx, ' Neither'),
                lm.loglikelihood(ctx, ' False'),
            ])
            preds.append(np.argmax(probs))
        return simple_accuracy_metric(preds=preds, golds=golds)

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        return {"acc": pred == gold}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

class MNLIMismatched(MNLI):

...

@@ -138,9 +175,9 @@ class MNLIMismatched(MNLI):
        return self.data["test_mismatched"]

class MRPC(HFTask):
class QNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "mrpc"
    DATASET_NAME = "qnli"

    def has_training_docs(self):
        return True

...

@@ -151,36 +188,90 @@ class MRPC(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing."

    def doc_to_text(self, doc):
        return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
            doc["question"],
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = not entailment
        return " {}".format({0: "Yes", 1: "No"}[doc["label"]])

    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " Yes")
        ll_no, _ = rf.loglikelihood(ctx, " No")
        return ll_yes, ll_no

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        pred = ll_no > ll_yes
        gold = doc["label"]
        return {"acc": pred == gold}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

class WNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
        return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        return " {}".format(yesno(doc["label"]))
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_neither, ll_false

    def process_results(self, doc, results):
        gold = doc["label"]
        pred = np.argmax(results)
        return {"acc": pred == gold}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no'))
        return get_accuracy_and_f1(preds=preds, golds=golds)

class RTE(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "rte"

...

@@ -205,27 +296,36 @@ class RTE(HFTask):
        # 1 = not_entailment
        return " {}".format({0: "True", 1: "False"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
    def construct_requests(self, doc, ctx):
        ll_true, _ = rf.loglikelihood(ctx, " True")
        ll_false, _ = rf.loglikelihood(ctx, " False")
        return ll_true, ll_false
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
        return simple_accuracy_metric(preds=preds, golds=golds)

    def process_results(self, doc, results):
        ll_true, ll_false = results
        pred = ll_false > ll_true
        gold = doc["label"]
        return {"acc": pred == gold}

    def higher_is_better(self):
        return {"acc": True}

    def aggregation(self):
        return {"acc": mean}

class QNLI(HFTask):
# Similarity and Paraphrase Tasks

class MRPC(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "qnli"
    DATASET_NAME = "mrpc"

    def has_training_docs(self):
        return True

...

@@ -236,33 +336,43 @@ class QNLI(HFTask):
    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing."

    def doc_to_text(self, doc):
        return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
            doc["question"],
            doc["sentence"],
        return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = not entailment
        return " {}".format({0: "Yes", 1: "No"}[doc["label"]])
        return " {}".format(yesno(doc["label"]))

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
        return simple_accuracy_metric(preds=preds, golds=golds)

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        pred = ll_yes > ll_no
        return {
            "acc": pred == gold,
            "f1": (gold, pred),
        }

    def higher_is_better(self):
        return {"acc": True, "f1": True}

    def aggregation(self):
        return {"acc": mean, "f1": f1_score}

class QQP(HFTask):

...

@@ -290,22 +400,31 @@ class QQP(HFTask):
    def doc_to_target(self, doc):
        return " {}".format(yesno(doc["label"]))

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
    def construct_requests(self, doc, ctx):
        ll_yes, _ = rf.loglikelihood(ctx, " yes")
        ll_no, _ = rf.loglikelihood(ctx, " no")
        return ll_yes, ll_no
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
        return get_accuracy_and_f1(preds=preds, golds=golds)

    def process_results(self, doc, results):
        ll_yes, ll_no = results
        gold = doc["label"]
        pred = ll_yes > ll_no
        return {
            "acc": pred == gold,
            "f1": (gold, pred),
        }

    def higher_is_better(self):
        return {"acc": True, "f1": True}

    def aggregation(self):
        return {"acc": mean, "f1": f1_score}

class STSB(HFTask):

...

@@ -368,93 +487,3 @@ class STSB(HFTask):
            "minor": minor,
            "higher_is_better": True,
        }

class SST(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if each sentence is Positive or Negative."

    def doc_to_text(self, doc):
        return "sentence:\t{}\t\nanswer:".format(
            doc["sentence"],
        )

    def doc_to_target(self, doc):
        return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
        return simple_accuracy_metric(preds=preds, golds=golds)

class WNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc):
        return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )

    def doc_to_target(self, doc):
        # True = entailment
        # False = contradiction
        # Neither = neutral
        return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework
        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework.
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py.
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            probs = np.array([
                lm.loglikelihood(ctx, ' True'),
                lm.loglikelihood(ctx, ' Neither'),
                lm.loglikelihood(ctx, ' False'),
            ])
            preds.append(np.argmax(probs))
        return simple_accuracy_metric(preds=preds, golds=golds)
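
Taken together, the converted classes follow one pattern: doc_to_text builds a prompt, construct_requests asks the LM for loglikelihoods of candidate continuations, process_results turns the returned loglikelihoods into per-document metric entries, and aggregation names the reduction applied across documents. Below is a rough, self-contained sketch of that flow with a hypothetical ToyTask and made-up loglikelihoods; it is not the harness's actual evaluator.

import numpy as np

class ToyTask:
    """Hypothetical stand-in for a converted task such as SST:
    a two-way loglikelihood comparison."""

    def doc_to_text(self, doc):
        return "sentence:\t{}\t\nanswer:".format(doc["sentence"])

    def construct_requests(self, doc, ctx):
        # In the harness these would be rf.loglikelihood(ctx, " Positive") and
        # rf.loglikelihood(ctx, " Negative"); here we only record the requests.
        return [(ctx, " Positive"), (ctx, " Negative")]

    def process_results(self, doc, results):
        ll_positive, ll_negative = results
        pred = int(ll_positive > ll_negative)
        return {"acc": pred == doc["label"]}

    def aggregation(self):
        return {"acc": np.mean}

# Made-up loglikelihoods standing in for an LM's answers to the requests.
fake_lls = {" Positive": -1.0, " Negative": -2.5}
docs = [
    {"sentence": "A delightful film.", "label": 1},
    {"sentence": "A tedious mess.", "label": 0},
]

task = ToyTask()
per_doc = []
for doc in docs:
    ctx = task.doc_to_text(doc)
    requests = task.construct_requests(doc, ctx)
    results = [fake_lls[continuation] for _, continuation in requests]
    per_doc.append(task.process_results(doc, results))

# Aggregate each metric over the per-document entries, as the evaluator would.
print({name: fn([d[name] for d in per_doc]) for name, fn in task.aggregation().items()})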