gaoqiong / lm-evaluation-harness
Commit dae7b868 (parent 341663a9), authored Feb 05, 2022 by Quentin Gregory Anthony

Added decontamination to remaining evals
Changes: 33 files in total; showing 20 changed files with 144 additions and 1 deletion (+144, -1) on this page (page 1 of 2).
lm_eval/tasks/anli.py              +6  -0
lm_eval/tasks/arc.py               +6  -0
lm_eval/tasks/arithmetic.py        +6  -0
lm_eval/tasks/asdiv.py             +6  -0
lm_eval/tasks/blimp.py             +6  -0
lm_eval/tasks/cbt.py               +7  -0
lm_eval/tasks/coqa.py              +6  -0
lm_eval/tasks/drop.py              +6  -0
lm_eval/tasks/glue.py              +6  -0
lm_eval/tasks/headqa.py            +7  -1
lm_eval/tasks/hendrycks_ethics.py  +24 -0
lm_eval/tasks/hendrycks_math.py    +6  -0
lm_eval/tasks/hendrycks_test.py    +7  -0
lm_eval/tasks/lambada_cloze.py     +6  -0
lm_eval/tasks/mc_taco.py           +6  -0
lm_eval/tasks/mutual.py            +6  -0
lm_eval/tasks/naturalqs.py         +6  -0
lm_eval/tasks/openbookqa.py        +7  -0
lm_eval/tasks/prost.py             +7  -0
lm_eval/tasks/qa4mre.py            +7  -0
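Each file below follows the same pattern: the task gains a should_decontaminate() hook that opts it into training-set decontamination, and a doc_to_decontamination_query() hook that returns the text to check for overlap against the pretraining corpus. As a rough sketch of how a consumer might use these hooks (only the two hook calls come from this commit; ngrams(), filter_contaminated(), and the 13-gram heuristic are illustrative assumptions, not the harness's actual decontamination code):

    # Hypothetical consumer of the two hooks added in this commit.
    def ngrams(text, n=13):
        # Overlapping word n-grams; 13-grams follow the GPT-3-style
        # decontamination heuristic, assumed here for illustration.
        words = text.split()
        return {tuple(words[i:i + n]) for i in range(len(words) - n + 1)}

    def filter_contaminated(task, docs, train_ngrams):
        # Tasks that do not opt in keep all of their documents.
        if not task.should_decontaminate():
            return list(docs)
        # Keep only docs whose decontamination query shares no n-gram
        # with the training corpus.
        return [
            doc for doc in docs
            if not (ngrams(task.doc_to_decontamination_query(doc)) & train_ngrams)
        ]

Note that each task returns only the document side of the prompt (premise, passage, query) rather than the fully formatted prompt, so overlap is measured against the source text itself.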
lm_eval/tasks/anli.py
@@ -40,6 +40,12 @@ class ANLIBase(HFTask):
         # want to do it exactly as OA did?
         return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["premise"]
+
     def doc_to_target(self, doc):
         # True = entailment
         # False = contradiction
...
lm_eval/tasks/arc.py
@@ -32,6 +32,12 @@ class ARCEasy(HFTask, MultipleChoiceTask):
     def doc_to_text(self, doc):
         return doc["query"]

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
+
 class ARCChallenge(ARCEasy):
     DATASET_PATH = "ai2_arc"
...
lm_eval/tasks/arithmetic.py
@@ -55,6 +55,12 @@ class Arithmetic(Task):
     def doc_to_text(self, doc):
         return doc.context

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc.context
+
     def doc_to_target(self, doc):
         return doc.completion
...
lm_eval/tasks/asdiv.py
@@ -93,6 +93,12 @@ class Asdiv(Task):
         # TODO: add solution-type
         return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc['body'] + " " + doc['question']
+
     def doc_to_target(self, doc):
         # TODO: add formula
...
lm_eval/tasks/blimp.py
@@ -47,6 +47,12 @@ class BlimpTask(HFTask):
         # this method is invoked by tests only
         return ""

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["sentence_good"] + " " + doc["sentence_bad"]
+
     def doc_to_target(self, doc):
         # this method is invoked by tests only
         return ""
...
lm_eval/tasks/cbt.py
@@ -38,6 +38,13 @@ class CBTBase(HFTask):
         text = "Passage: " + passage + "\nQuestion: " + doc["question"]
         return self.detokenize(text)

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        passage = " ".join(doc["sentences"])
+        return passage
+
     def doc_to_target(self, doc):
         return ""
...
lm_eval/tasks/coqa.py
@@ -47,6 +47,12 @@ class CoQA(Task):
             doc_text += question + answer
         return doc_text

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["story"] + " " + doc["questions"]
+
     @classmethod
     def get_answers(cls, doc, turn_id):
         # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
...
lm_eval/tasks/drop.py
@@ -87,6 +87,12 @@ class DROP(Task):
     def doc_to_text(self, doc):
         return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc['passage'] + " " + doc['question']
+
     def doc_to_target(self, doc):
         return " " + ", ".join(doc["answers"][0])
...
lm_eval/tasks/glue.py
@@ -24,6 +24,12 @@ class CoLA(HFTask):
     def doc_to_text(self, doc):
         return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["sentence"]
+
     def doc_to_target(self, doc):
         return " {}".format({1: "yes", 0: "no"}[doc["label"]])
...
lm_eval/tasks/headqa.py
@@ -27,6 +27,12 @@ class HeadQABase(HFTask, MultipleChoiceTask):
     def doc_to_text(self, doc):
         return doc["query"]

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
+
 class HeadQAEn(HeadQABase):
     DATASET_NAME = "en"
...
lm_eval/tasks/hendrycks_ethics.py
@@ -98,6 +98,12 @@ class EthicsCM(Ethics):
     def doc_to_text(self, doc):
         return "{}\nQuestion: Is this wrong?\nAnswer:".format(doc[1])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc[1]
+
     def doc_to_target(self, doc):
         return " {}".format(yesno(int(doc[0])))
...
@@ -138,6 +144,12 @@ class EthicsDeontology(Ethics):
         prompt = " ".join([doc[1], doc[2]])
         return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(prompt)

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return " ".join([doc[1], doc[2]])
+
     def doc_to_target(self, doc):
         target = ["unreasonable", "reasonable"][int(doc[0])]
         return " {}".format(target)
...
@@ -187,6 +199,12 @@ class EthicsJustice(Ethics):
     def doc_to_text(self, doc):
         return "Question: Would most people believe this reasonable or unreasonable to say? \"{}\"\nAnswer:".format(doc[1])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc[1]
+
     def doc_to_target(self, doc):
         target = ["unreasonable", "reasonable"][int(doc[0])]
         return " {}".format(target)
...
@@ -253,6 +271,12 @@ class EthicsUtilitarianismOriginal(Ethics):
     def doc_to_text(self, doc):
         return 'Activity: "{}"\nRating:'.format(doc["activity"])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["activity"]
+
     def doc_to_target(self, doc):
         return " " + doc["rating"]
...
lm_eval/tasks/hendrycks_math.py
@@ -58,6 +58,12 @@ class Math(Task):
     def doc_to_text(self, doc):
         return "Problem: " + doc["problem"] + "\nAnswer:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["problem"]
+
     def doc_to_target(self, doc):
         return " " + doc["answer"]
...
lm_eval/tasks/hendrycks_test.py
@@ -116,3 +116,10 @@ class GeneralHendrycksTest(MultipleChoiceTask):
     def doc_to_text(self, doc):
         return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
lm_eval/tasks/lambada_cloze.py
@@ -11,5 +11,11 @@ class LAMBADA_cloze(LAMBADA):
     def doc_to_text(self, doc):
         return doc['text'].rsplit(' ', 1)[0] + " ____. ->"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc['text']
+
     def doc_to_target(self, doc):
         return " " + doc['text'].rsplit(' ', 1)[1]
lm_eval/tasks/mc_taco.py
@@ -43,6 +43,12 @@ class MCTACO(HFTask):
         return f"{doc['sentence']}\nQuestion: {doc['question']}\n" \
                f"Answer: {doc['answer']}\nPlausible:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc['question'] + " " + doc['sentence']
+
     def doc_to_target(self, doc):
         return " " + ["no", "yes"][doc['label']]
...
lm_eval/tasks/mutual.py
@@ -73,6 +73,12 @@ class MuTualBase(Task):
     def doc_to_text(self, doc):
         return self.detokenize(doc["article"])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["article"]
+
     def doc_to_target(self, doc):
         return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])])
...
lm_eval/tasks/naturalqs.py
@@ -36,6 +36,12 @@ class NaturalQs(HFTask):
     def doc_to_text(self, doc):
         return 'Q: ' + doc['question']['text'] + '\n\n' + 'A:'

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc['question']['text']
+
     def doc_to_target(self, doc):
         # There's a short answer and a long answer. Based on the paper, I'm using the long answer.
         short_answer = doc['annotations']['short_answers'][0]['text']
...
lm_eval/tasks/openbookqa.py
@@ -27,3 +27,10 @@ class OpenBookQA(HFTask, MultipleChoiceTask):
     def doc_to_text(self, doc):
         return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
lm_eval/tasks/prost.py
@@ -55,3 +55,10 @@ class PROST(HFTask, MultipleChoiceTask):
     def doc_to_text(self, doc):
         return doc["query"]
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["query"]
lm_eval/tasks/qa4mre.py
@@ -73,6 +73,13 @@ class QA4MRE(MultipleChoiceTask):
     def doc_to_text(self, doc):
         return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["source"] + " " + doc["query"]
+
 class QA4MRE_2011(QA4MRE):
     YEAR = 2011
...