Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
60a6fd8c
Commit
60a6fd8c
authored
Jan 27, 2021
by
Leo Gao
Browse files
Implement unit testing and fix lots of problems with tasks
parent
693c19e2
Changes
13
Show whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
103 additions
and
66 deletions
+103
-66
lm_eval/models/dummy.py
lm_eval/models/dummy.py
+18
-6
lm_eval/models/gpt2.py
lm_eval/models/gpt2.py
+0
-1
lm_eval/models/gpt3.py
lm_eval/models/gpt3.py
+2
-0
lm_eval/tasks/__init__.py
lm_eval/tasks/__init__.py
+19
-17
lm_eval/tasks/arithmetic.py
lm_eval/tasks/arithmetic.py
+1
-1
lm_eval/tasks/common.py
lm_eval/tasks/common.py
+2
-0
lm_eval/tasks/drop.py
lm_eval/tasks/drop.py
+1
-1
lm_eval/tasks/race.py
lm_eval/tasks/race.py
+6
-9
lm_eval/tasks/sat.py
lm_eval/tasks/sat.py
+2
-0
lm_eval/tasks/storycloze.py
lm_eval/tasks/storycloze.py
+3
-3
lm_eval/tasks/superglue.py
lm_eval/tasks/superglue.py
+34
-13
lm_eval/tasks/triviaqa.py
lm_eval/tasks/triviaqa.py
+8
-10
lm_eval/tasks/wsc273.py
lm_eval/tasks/wsc273.py
+7
-5
No files found.
lm_eval/models/dummy.py
View file @
60a6fd8c
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import
random
from
lm_eval.base
import
LM
from
lm_eval.base
import
LM
from
.
import
MODEL_REGISTRY
@
MODEL_REGISTRY
.
register
(
"dummy"
)
class
DummyLM
(
LM
):
class
DummyLM
(
LM
):
def
__init__
(
self
):
pass
@
classmethod
def
create_from_arg_string
(
cls
,
arg_string
):
return
cls
()
def
loglikelihood
(
self
,
requests
):
res
=
[]
for
_
in
requests
:
res
.
append
((
-
random
.
random
(),
False
))
return
res
def
loglikelihood
(
self
,
context
,
continuation
):
def
greedy_until
(
self
,
requests
):
return
0.0
# TODO: implement
pass
\ No newline at end of file
lm_eval/models/gpt2.py
View file @
60a6fd8c
...
@@ -19,7 +19,6 @@ class GPT2LM(LM):
...
@@ -19,7 +19,6 @@ class GPT2LM(LM):
return
cls
(
device
=
args
.
get
(
"device"
,
"cpu"
))
return
cls
(
device
=
args
.
get
(
"device"
,
"cpu"
))
def
loglikelihood
(
self
,
requests
):
def
loglikelihood
(
self
,
requests
):
print
(
requests
)
res
=
[]
res
=
[]
# TODO: vectorize properly
# TODO: vectorize properly
for
context
,
continuation
in
tqdm
(
requests
):
for
context
,
continuation
in
tqdm
(
requests
):
...
...
lm_eval/models/gpt3.py
View file @
60a6fd8c
...
@@ -32,6 +32,8 @@ class GPT3LM(LM):
...
@@ -32,6 +32,8 @@ class GPT3LM(LM):
return
cls
(
engine
=
args
.
get
(
"engine"
,
"davinci"
))
return
cls
(
engine
=
args
.
get
(
"engine"
,
"davinci"
))
def
loglikelihood
(
self
,
context
,
continuation
):
def
loglikelihood
(
self
,
context
,
continuation
):
# TODO: implement new framework
import
openai
import
openai
context_enc
=
self
.
tokenizer
.
encode
(
context
)
context_enc
=
self
.
tokenizer
.
encode
(
context
)
...
...
lm_eval/tasks/__init__.py
View file @
60a6fd8c
...
@@ -23,7 +23,7 @@ TASK_REGISTRY = {
...
@@ -23,7 +23,7 @@ TASK_REGISTRY = {
"rte"
:
glue
.
RTE
,
"rte"
:
glue
.
RTE
,
"qnli"
:
glue
.
QNLI
,
"qnli"
:
glue
.
QNLI
,
"qqp"
:
glue
.
QQP
,
"qqp"
:
glue
.
QQP
,
"stsb"
:
glue
.
STSB
,
#
"stsb": glue.STSB,
# not implemented yet
"sst"
:
glue
.
SST
,
"sst"
:
glue
.
SST
,
"wnli"
:
glue
.
WNLI
,
"wnli"
:
glue
.
WNLI
,
# SuperGLUE
# SuperGLUE
...
@@ -33,23 +33,25 @@ TASK_REGISTRY = {
...
@@ -33,23 +33,25 @@ TASK_REGISTRY = {
"multirc"
:
superglue
.
MultiRC
,
"multirc"
:
superglue
.
MultiRC
,
"record"
:
superglue
.
ReCoRD
,
"record"
:
superglue
.
ReCoRD
,
"wic"
:
superglue
.
WordsInContext
,
"wic"
:
superglue
.
WordsInContext
,
"wsc"
:
superglue
.
SGWinogradSchemaChallenge
,
#"wsc": superglue.SGWinogradSchemaChallenge, # not implemented yet
# Order by benchmark/genre?
# Order by benchmark/genre?
"arc_easy"
:
arc
.
ARCEasy
,
"arc_challenge"
:
arc
.
ARCChallenge
,
# "arc_easy": arc.ARCEasy, # not implemented yet
"quac"
:
quac
.
QuAC
,
# "arc_challenge": arc.ARCChallenge, # not implemented yet
"hellaswag"
:
hellaswag
.
HellaSwag
,
# "quac": quac.QuAC, # not implemented yet
"openbookqa"
:
openbookqa
.
OpenBookQA
,
# "hellaswag": hellaswag.HellaSwag, # not implemented yet
"sat"
:
sat
.
SATAnalogies
,
# "openbookqa": openbookqa.OpenBookQA, # not implemented yet
"squad"
:
squad
.
SQuAD
,
# "sat": sat.SATAnalogies, # not implemented yet
"race"
:
race
.
RACE
,
# "squad": squad.SQuAD, # not implemented yet
"naturalqs"
:
naturalqs
.
NaturalQs
,
# "race": race.RACE, # not implemented yet
"webqs"
:
webqs
.
WebQs
,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"wsc273"
:
wsc273
.
WinogradSchemaChallenge273
,
# "webqs": webqs.WebQs, # not implemented yet
"winogrande"
:
winogrande
.
Winogrande
,
# "wsc273": wsc273.WinogradSchemaChallenge273, # not implemented yet
"anli_r1"
:
anli
.
ANLIRound1
,
# "winogrande": winogrande.Winogrande, # not implemented yet
"anli_r2"
:
anli
.
ANLIRound2
,
# "anli_r1": anli.ANLIRound1, # not implemented yet
"anli_r3"
:
anli
.
ANLIRound3
,
# "anli_r2": anli.ANLIRound2, # not implemented yet
# "anli_r3": anli.ANLIRound3, # not implemented yet
# arithmetic
# arithmetic
"arithmetic_2da"
:
arithmetic
.
Arithmetic2DPlus
,
"arithmetic_2da"
:
arithmetic
.
Arithmetic2DPlus
,
"arithmetic_2ds"
:
arithmetic
.
Arithmetic2DMinus
,
"arithmetic_2ds"
:
arithmetic
.
Arithmetic2DMinus
,
...
...
lm_eval/tasks/arithmetic.py
View file @
60a6fd8c
...
@@ -12,7 +12,6 @@ class Arithmetic(Dataset):
...
@@ -12,7 +12,6 @@ class Arithmetic(Dataset):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
self
.
set_docs
()
def
download
(
self
):
def
download
(
self
):
file_name
,
checksum
=
self
.
get_file_download_info
()
file_name
,
checksum
=
self
.
get_file_download_info
()
...
@@ -20,6 +19,7 @@ class Arithmetic(Dataset):
...
@@ -20,6 +19,7 @@ class Arithmetic(Dataset):
if
not
os
.
path
.
exists
(
self
.
directory
):
if
not
os
.
path
.
exists
(
self
.
directory
):
os
.
makedirs
(
self
.
directory
)
os
.
makedirs
(
self
.
directory
)
download_file
(
url
,
self
.
directory
+
file_name
,
checksum
)
download_file
(
url
,
self
.
directory
+
file_name
,
checksum
)
self
.
set_docs
()
@
abc
.
abstractmethod
@
abc
.
abstractmethod
def
get_file_download_info
(
self
):
def
get_file_download_info
(
self
):
...
...
lm_eval/tasks/common.py
View file @
60a6fd8c
...
@@ -11,6 +11,8 @@ class HFTask(Dataset):
...
@@ -11,6 +11,8 @@ class HFTask(Dataset):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
self
.
_training_docs
=
None
self
.
_training_docs
=
None
def
download
(
self
):
self
.
data
=
datasets
.
load_dataset
(
path
=
self
.
DATASET_PATH
,
name
=
self
.
DATASET_NAME
)
self
.
data
=
datasets
.
load_dataset
(
path
=
self
.
DATASET_PATH
,
name
=
self
.
DATASET_NAME
)
def
has_training_docs
(
self
):
def
has_training_docs
(
self
):
...
...
lm_eval/tasks/drop.py
View file @
60a6fd8c
...
@@ -11,7 +11,7 @@ class DROP(Dataset):
...
@@ -11,7 +11,7 @@ class DROP(Dataset):
DATAFOLDER
=
Path
(
__file__
).
parent
/
"../../data/drop"
DATAFOLDER
=
Path
(
__file__
).
parent
/
"../../data/drop"
def
__init__
(
self
):
def
__init__
(
self
):
s
elf
.
download
()
s
uper
().
__init__
()
def
has_training_docs
(
self
):
def
has_training_docs
(
self
):
"""Whether the task has a training set"""
"""Whether the task has a training set"""
...
...
lm_eval/tasks/race.py
View file @
60a6fd8c
...
@@ -54,16 +54,13 @@ class RACE(HFTask):
...
@@ -54,16 +54,13 @@ class RACE(HFTask):
# TODO: figure out description
# TODO: figure out description
return
""
return
""
def
doc_to_text
(
self
,
doc
,
include_target
=
True
):
def
doc_to_text
(
self
,
doc
):
r
=
"Article:
\n
"
+
doc
[
'article'
]
+
'
\n\n
'
# TODO: implement
pass
r
+=
doc
[
'problems'
]
>>
apply
(
enumerate
)
>>
each
(
def
doc_to_target
(
self
,
doc
):
lambda
x
:
'Q: '
+
x
[
1
][
'question'
]
+
'
\n\n
A:'
# TODO: implement
+
((
' '
+
x
[
1
][
'options'
][[
'A'
,
'B'
,
'C'
,
'D'
].
index
(
x
[
1
][
'answer'
])])
\
pass
if
x
[
0
]
!=
len
(
doc
[
'problems'
])
-
1
or
include_target
else
''
))
\
>>
join
(
'
\n\n
'
)
return
r
def
construct_requests
(
self
,
doc
,
ctx
):
def
construct_requests
(
self
,
doc
,
ctx
):
""" Uses RequestFactory to construct Requests and returns an iterable of
""" Uses RequestFactory to construct Requests and returns an iterable of
...
...
lm_eval/tasks/sat.py
View file @
60a6fd8c
...
@@ -9,6 +9,8 @@ from ..utils import sh
...
@@ -9,6 +9,8 @@ from ..utils import sh
class
SATAnalogies
(
Dataset
):
class
SATAnalogies
(
Dataset
):
NEEDS_MANUAL_DL
=
True
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
...
...
lm_eval/tasks/storycloze.py
View file @
60a6fd8c
...
@@ -5,8 +5,8 @@ from ..utils import sh
...
@@ -5,8 +5,8 @@ from ..utils import sh
import
csv
import
csv
class
StoryCloze
(
Dataset
):
class
StoryCloze
(
Dataset
):
def
__init__
(
self
):
NEEDS_MANUAL_DL
=
True
self
.
download
()
def
download
(
self
):
def
download
(
self
):
#TODO: replace with Eye link
#TODO: replace with Eye link
pass
pass
...
...
lm_eval/tasks/superglue.py
View file @
60a6fd8c
...
@@ -75,6 +75,7 @@ class CommitmentBank(HFTask):
...
@@ -75,6 +75,7 @@ class CommitmentBank(HFTask):
return
True
return
True
def
fewshot_description
(
self
):
def
fewshot_description
(
self
):
# TODO: figure out actual description
return
"Given a premise and a hypothesis, classify whether the author of the premise is committed"
\
return
"Given a premise and a hypothesis, classify whether the author of the premise is committed"
\
"to the truth of the hypothesis. The three possible labels are true, false or neither."
"to the truth of the hypothesis. The three possible labels are true, false or neither."
...
@@ -145,6 +146,7 @@ class Copa(HFTask):
...
@@ -145,6 +146,7 @@ class Copa(HFTask):
return
True
return
True
def
fewshot_description
(
self
):
def
fewshot_description
(
self
):
# TODO: figure out actual description
return
"Given a premise and one alternative with a causal relation to the premise and another without,"
\
return
"Given a premise and one alternative with a causal relation to the premise and another without,"
\
"choose the more plausible alternative"
"choose the more plausible alternative"
...
@@ -208,6 +210,7 @@ class MultiRC(HFTask):
...
@@ -208,6 +210,7 @@ class MultiRC(HFTask):
return
True
return
True
def
fewshot_description
(
self
):
def
fewshot_description
(
self
):
# TODO: figure out actual description
return
"READING COMPREHENSION ANSWER KEY"
return
"READING COMPREHENSION ANSWER KEY"
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
...
@@ -260,13 +263,16 @@ class ReCoRD(HFTask):
...
@@ -260,13 +263,16 @@ class ReCoRD(HFTask):
def
has_test_docs
(
self
):
def
has_test_docs
(
self
):
return
True
return
True
def
fewshot_description
(
self
):
# TODO: figure out actual description
return
""
def
training_docs
(
self
):
def
training_docs
(
self
):
# In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
# In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
# Each doc consists of multiple answer candidates, each of which is scored yes/no.
# Each doc consists of multiple answer candidates, each of which is scored yes/no.
# Hence, we one "doc" for each (context + passage, answer) pair.
# Hence, we one "doc" for each (context + passage, answer) pair.
# Moreover, we only use the correct answers for context packing
# Moreover, we only use the correct answers for context packing
# (This is not an issue for evaluation, where we can directly score multiple candidates at once).
# (This is not an issue for evaluation, where we can directly score multiple candidates at once).
if
self
.
has_training_docs
():
if
self
.
_training_docs
is
None
:
if
self
.
_training_docs
is
None
:
self
.
_training_docs
=
[]
self
.
_training_docs
=
[]
for
doc
in
self
.
data
[
"train"
]:
for
doc
in
self
.
data
[
"train"
]:
...
@@ -279,6 +285,16 @@ class ReCoRD(HFTask):
...
@@ -279,6 +285,16 @@ class ReCoRD(HFTask):
})
})
return
self
.
_training_docs
return
self
.
_training_docs
def
validation_docs
(
self
):
for
doc
in
self
.
data
[
"validation"
]:
for
entity
in
list
(
set
(
doc
[
"entities"
])):
yield
{
"passage"
:
doc
[
"passage"
],
"query"
:
doc
[
"query"
],
"entity"
:
entity
,
"label"
:
entity
in
doc
[
"answers"
],
}
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
initial_text
,
*
highlights
=
doc
[
"passage"
].
strip
().
split
(
"
\n
@highlight
\n
"
)
initial_text
,
*
highlights
=
doc
[
"passage"
].
strip
().
split
(
"
\n
@highlight
\n
"
)
text
=
initial_text
+
"
\n\n
"
text
=
initial_text
+
"
\n\n
"
...
@@ -296,7 +312,7 @@ class ReCoRD(HFTask):
...
@@ -296,7 +312,7 @@ class ReCoRD(HFTask):
def
construct_requests
(
self
,
doc
,
ctx
):
def
construct_requests
(
self
,
doc
,
ctx
):
requests
=
[
requests
=
[
rf
.
loglikelihood
(
ctx
,
self
.
format_answer
(
query
=
doc
[
"query"
],
entity
=
entity
))
rf
.
loglikelihood
(
ctx
,
self
.
format_answer
(
query
=
doc
[
"query"
],
entity
=
entity
))
for
entity
in
doc
[
"entit
ies
"
]
for
entity
in
doc
[
"entit
y
"
]
]
]
return
requests
return
requests
...
@@ -342,6 +358,10 @@ class WordsInContext(HFTask):
...
@@ -342,6 +358,10 @@ class WordsInContext(HFTask):
def
has_test_docs
(
self
):
def
has_test_docs
(
self
):
return
True
return
True
def
fewshot_description
(
self
):
# TODO: figure out actual description
return
""
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"{}
\n
{}
\n
Question: Is the word '{}' used in the same way in the"
\
return
"{}
\n
{}
\n
Question: Is the word '{}' used in the same way in the"
\
" two sentences above?
\n
answer:"
.
format
(
" two sentences above?
\n
answer:"
.
format
(
...
@@ -405,6 +425,7 @@ class SGWinogradSchemaChallenge(HFTask):
...
@@ -405,6 +425,7 @@ class SGWinogradSchemaChallenge(HFTask):
return
self
.
_training_docs
return
self
.
_training_docs
def
fewshot_description
(
self
):
def
fewshot_description
(
self
):
# TODO: figure out actual description
return
"Final Exam with Answer Key
\n
"
\
return
"Final Exam with Answer Key
\n
"
\
"Instructions: Please carefully read the following passages. "
\
"Instructions: Please carefully read the following passages. "
\
"For each passage, you must identify which noun the pronoun marked in *bold*"
\
"For each passage, you must identify which noun the pronoun marked in *bold*"
\
...
...
lm_eval/tasks/triviaqa.py
View file @
60a6fd8c
import
os
import
json
import
json
import
random
import
random
from
lm_eval.base
import
Dataset
from
lm_eval.base
import
Dataset
from
..utils
import
sh
from
..utils
import
sh
class
TriviaQA
(
Dataset
):
class
TriviaQA
(
Dataset
):
def
__init__
(
self
):
self
.
download
()
def
download
(
self
):
def
download
(
self
):
#pass
if
not
os
.
path
.
exists
(
'data/triviaqa'
):
#TODO: don't download if files already there
sh
(
"""
sh
(
"""
mkdir -p data/triviaqa
mkdir -p data/triviaqa
wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz -O data/triviaqa/trivia_qa-unfiltered.tar.gz
wget http://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz -O data/triviaqa/trivia_qa-unfiltered.tar.gz
...
...
lm_eval/tasks/wsc273.py
View file @
60a6fd8c
...
@@ -72,11 +72,13 @@ class WinogradSchemaChallenge273(Dataset):
...
@@ -72,11 +72,13 @@ class WinogradSchemaChallenge273(Dataset):
return
docs
return
docs
def
doc_to_text
(
self
,
doc
,
include_target
=
True
):
def
doc_to_text
(
self
,
doc
):
# WSC273 is currently only writing out full examples. Partial evaluation needs implementing.
# TODO: implement
text
=
doc
[
'completions'
][
'T'
]
+
' True. '
+
doc
[
'completions'
][
'F'
]
+
' False.'
pass
return
text
def
doc_to_target
(
self
,
doc
):
# TODO: implement
pass
def
construct_requests
(
self
,
doc
,
ctx
):
def
construct_requests
(
self
,
doc
,
ctx
):
""" Uses RequestFactory to construct Requests and returns an iterable of
""" Uses RequestFactory to construct Requests and returns an iterable of
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment