Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
19b0f529
Unverified
Commit
19b0f529
authored
Feb 02, 2021
by
Leo Gao
Committed by
GitHub
Feb 02, 2021
Browse files
Merge pull request #111 from jon-tow/wsc273-evaluation
Implement `WSC273` evaluation and data processing
parents
e12d0078
bc5495d2
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
134 additions
and
127 deletions
+134
-127
lm_eval/tasks/__init__.py
lm_eval/tasks/__init__.py
+1
-1
lm_eval/tasks/wsc273.py
lm_eval/tasks/wsc273.py
+133
-126
No files found.
lm_eval/tasks/__init__.py
View file @
19b0f529
...
@@ -57,7 +57,7 @@ TASK_REGISTRY = {
...
@@ -57,7 +57,7 @@ TASK_REGISTRY = {
"race"
:
race
.
RACE
,
"race"
:
race
.
RACE
,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"webqs"
:
webqs
.
WebQs
,
"webqs"
:
webqs
.
WebQs
,
#
"wsc273": wsc273.WinogradSchemaChallenge273,
# not implemented yet
"wsc273"
:
wsc273
.
WinogradSchemaChallenge273
,
# "winogrande": winogrande.Winogrande, # not implemented yet
# "winogrande": winogrande.Winogrande, # not implemented yet
"anli_r1"
:
anli
.
ANLIRound1
,
"anli_r1"
:
anli
.
ANLIRound1
,
"anli_r2"
:
anli
.
ANLIRound2
,
"anli_r2"
:
anli
.
ANLIRound2
,
...
...
lm_eval/tasks/wsc273.py
View file @
19b0f529
import
json
import
numpy
as
np
import
random
import
random
import
os
from
lm_eval.base
import
rf
,
mean
from
lm_eval.base
import
Task
from
.
common
import
HFTask
from
..utils
import
sh
"""
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
"""
class
WinogradSchemaChallenge273
(
HFTask
):
DATASET_PATH
=
"winograd_wsc"
DATASET_NAME
=
"wsc273"
upper_pronouns
=
[
"A"
,
"An"
,
"The"
,
"She"
,
"He"
,
"It"
,
"They"
,
"My"
,
"His"
,
"Her"
,
"Their"
]
class
WinogradSchemaChallenge273
(
Task
):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
self
.
data
=
self
.
__clean_data
()
def
download
(
self
):
if
not
os
.
path
.
exists
(
'data/wsc273'
):
def
__clean_data
(
self
):
sh
(
"""
# The HF implementation of `wsc273` is not `partial evaluation` friendly.
mkdir -p data/wsc273
data
=
[]
wget https://git.cse.msu.edu/bakerb15/nlp-final-project/raw/master/Winogard/reproduce/commonsense_test/wsc273.json -O data/wsc273/wsc273.json
for
doc
in
self
.
data
[
"test"
]:
"""
)
doc
[
"text"
]
=
doc
[
"text"
].
replace
(
" "
,
" "
)
doc
[
"options"
][
0
]
=
self
.
__normalize_option
(
doc
[
"options"
][
0
],
doc
)
doc
[
"options"
][
1
]
=
self
.
__normalize_option
(
doc
[
"options"
][
1
],
doc
)
data
.
append
(
doc
)
return
{
"test"
:
data
}
def
__normalize_option
(
self
,
option
,
doc
):
# Append `'s` to possessive determiner based options.
if
doc
[
"pronoun"
].
lower
()
in
[
"my"
,
"his"
,
"her"
,
"our"
,
"their"
]:
option
+=
"'s"
# Appropriately lowercase the pronoun in the option.
pronoun
=
option
.
split
()[
0
]
start_of_sentence
=
doc
[
"text"
][
doc
[
'pronoun_loc'
]
-
2
]
==
'.'
if
not
start_of_sentence
and
pronoun
in
self
.
upper_pronouns
:
return
option
.
replace
(
pronoun
,
pronoun
.
lower
())
return
option
def
has_training_docs
(
self
):
def
has_training_docs
(
self
):
return
False
return
False
...
@@ -25,60 +51,35 @@ class WinogradSchemaChallenge273(Task):
...
@@ -25,60 +51,35 @@ class WinogradSchemaChallenge273(Task):
def
has_test_docs
(
self
):
def
has_test_docs
(
self
):
return
True
return
True
def
training_docs
(
self
):
def
fewshot_examples
(
self
,
k
):
return
[]
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
def
validation_docs
(
self
):
return
random
.
sample
(
list
(
self
.
test_docs
()),
k
)
return
[]
def
test_docs
(
self
):
myjson
=
json
.
load
(
open
(
'data/wsc273/wsc273.json'
))
return
self
.
load_doc
(
myjson
)
def
fewshot_description
(
self
):
def
fewshot_description
(
self
):
# TODO: redo description
# TODO: redo description
return
"Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
return
"Winograd schema sentence with correct continuation. True. Winograd schema sentence with incorrect continuation. False."
def
load_doc
(
self
,
myjson
):
@
classmethod
docs
=
[]
def
partial_context
(
cls
,
doc
):
for
i
in
range
(
0
,
273
*
2
,
2
):
# Substitute the pronoun in the original text with each candidate
item1
=
myjson
[
i
]
# choice and ignore everything after.
item2
=
myjson
[
i
+
1
]
context1
=
doc
[
"text"
][:
doc
[
"pronoun_loc"
]]
+
doc
[
"options"
][
0
]
context2
=
doc
[
"text"
][:
doc
[
"pronoun_loc"
]]
+
doc
[
"options"
][
1
]
if
item1
[
'question_id'
]
!=
item2
[
'question_id'
]:
return
context1
,
context2
raise
ValueError
(
"WSC273 has missing completion pair."
)
question_id
=
item1
[
'question_id'
]
if
item1
[
'correctness'
]
==
True
:
@
classmethod
doc
=
{
def
partial_target
(
cls
,
doc
):
'id'
:
question_id
,
# The target is everything after the document specified pronoun.
'completions'
:
{
start_index
=
doc
[
"pronoun_loc"
]
+
len
(
doc
[
"pronoun"
])
'T'
:
item1
[
'substitution'
],
return
doc
[
"text"
][
start_index
:].
strip
()
'F'
:
item2
[
'substitution'
],
},
}
if
item2
[
'correctness'
]
==
True
:
doc
=
{
'id'
:
question_id
,
'completions'
:
{
'F'
:
item1
[
'substitution'
],
'T'
:
item2
[
'substitution'
],
},
}
docs
.
append
(
doc
)
return
docs
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
# TODO: implement
context1
,
context2
=
self
.
partial_context
(
doc
)
pass
return
context1
+
'
\n
'
+
context2
+
'
\n
'
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
# TODO: implement
return
self
.
partial_target
(
doc
)
pass
def
construct_requests
(
self
,
doc
,
ctx
):
def
construct_requests
(
self
,
doc
,
ctx
):
""" Uses RequestFactory to construct Requests and returns an iterable of
""" Uses RequestFactory to construct Requests and returns an iterable of
...
@@ -91,8 +92,11 @@ class WinogradSchemaChallenge273(Task):
...
@@ -91,8 +92,11 @@ class WinogradSchemaChallenge273(Task):
language description, as well as the few shot examples, and the question
language description, as well as the few shot examples, and the question
part of the document for `doc`.
part of the document for `doc`.
"""
"""
# TODO: implement evaluation.
target
=
self
.
partial_target
(
doc
)
raise
NotImplementedError
(
'Evaluation not implemented'
)
context1
,
context2
=
self
.
partial_context
(
doc
)
ll_context1
,
_
=
rf
.
loglikelihood
(
context1
,
" "
+
target
)
ll_context2
,
_
=
rf
.
loglikelihood
(
context2
,
" "
+
target
)
return
ll_context1
,
ll_context2
def
process_results
(
self
,
doc
,
results
):
def
process_results
(
self
,
doc
,
results
):
"""Take a single document and the LM results and evaluates, returning a
"""Take a single document and the LM results and evaluates, returning a
...
@@ -104,8 +108,9 @@ class WinogradSchemaChallenge273(Task):
...
@@ -104,8 +108,9 @@ class WinogradSchemaChallenge273(Task):
:param results:
:param results:
The results of the requests created in construct_requests.
The results of the requests created in construct_requests.
"""
"""
# TODO: implement evaluation.
return
{
raise
NotImplementedError
(
'Evaluation not implemented'
)
"acc"
:
np
.
argmax
(
results
)
==
doc
[
"label"
]
}
def
aggregation
(
self
):
def
aggregation
(
self
):
"""
"""
...
@@ -113,8 +118,9 @@ class WinogradSchemaChallenge273(Task):
...
@@ -113,8 +118,9 @@ class WinogradSchemaChallenge273(Task):
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
functions that aggregate a list of metrics
"""
"""
# TODO: implement evaluation.
return
{
raise
NotImplementedError
(
'Evaluation not implemented'
)
"acc"
:
mean
}
def
higher_is_better
(
self
):
def
higher_is_better
(
self
):
"""
"""
...
@@ -122,5 +128,6 @@ class WinogradSchemaChallenge273(Task):
...
@@ -122,5 +128,6 @@ class WinogradSchemaChallenge273(Task):
A dictionary where keys are the names of submetrics and values are
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
whether a higher value of the submetric is better
"""
"""
# TODO: implement evaluation.
return
{
raise
NotImplementedError
(
'Evaluation not implemented'
)
"acc"
:
True
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment