Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
c6c67272
Commit
c6c67272
authored
Feb 12, 2021
by
Anthony DiPofi
Browse files
refactor MathQA as MultipleChoiceTask, uses full words for Q. and A.
parent
742b5df2
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
53 deletions
+28
-53
lm_eval/tasks/headqa.py
lm_eval/tasks/headqa.py
+1
-1
lm_eval/tasks/mathqa.py
lm_eval/tasks/mathqa.py
+27
-52
No files found.
lm_eval/tasks/headqa.py
View file @
c6c67272
...
@@ -19,7 +19,7 @@ class HeadQA(HFTask):
...
@@ -19,7 +19,7 @@ class HeadQA(HFTask):
return
""
return
""
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"Q: "
+
doc
[
'qtext'
]
+
'
\n
A:'
return
"Q
uestion
: "
+
doc
[
'qtext'
]
+
'
\n
A
nswer
:'
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
# this picks one answer to be the "correct" one, despite sometimes
# this picks one answer to be the "correct" one, despite sometimes
...
...
lm_eval/tasks/mathqa.py
View file @
c6c67272
from
.
common
import
HFTask
from
.
common
import
HFTask
from
lm_eval.base
import
mean
,
rf
from
lm_eval.base
import
mean
,
rf
,
MultipleChoiceTask
class
MathQA
(
HFTask
):
class
MathQA
(
HFTask
,
MultipleChoiceTask
):
DATASET_PATH
=
"math_qa"
DATASET_PATH
=
"math_qa"
DATASET_NAME
=
None
DATASET_NAME
=
None
...
@@ -14,60 +15,34 @@ class MathQA(HFTask):
...
@@ -14,60 +15,34 @@ class MathQA(HFTask):
def
has_test_docs
(
self
):
def
has_test_docs
(
self
):
return
True
return
True
def
fewshot_description
(
self
):
def
_convert_standard
(
self
,
doc
):
# TODO: figure out description
return
""
def
doc_to_text
(
self
,
doc
):
return
"Q: "
+
doc
[
'Problem'
]
+
'
\n
A:'
def
doc_to_target
(
self
,
doc
):
# this picks one answer to be the "correct" one, despite sometimes
# multiple correct answers being possible.
# TODO: make sure we're actually handling multi-answer correctly
return
" "
+
doc
[
'correct'
]
def
_remove_prefixes
(
self
,
aliases
):
# Optimization: Remove any alias that has a strict prefix elsewhere in the list
# we can do this because if the prefix is acceptable by isgreedy, we can stop looking
aliases
.
sort
()
ret
=
[
aliases
[
0
]]
for
alias
in
aliases
[
1
:]:
if
not
alias
.
startswith
(
ret
[
-
1
]):
ret
.
append
(
alias
)
return
ret
def
construct_requests
(
self
,
doc
,
ctx
):
out_doc
=
{
"query"
:
"Question: "
+
doc
[
'Problem'
]
+
" "
+
doc
[
"options"
]
+
"
\n
Answer:"
,
self
.
answer_options
=
[
'a'
,
'b'
,
'c'
,
'd'
,
'e'
]
"choices"
:
[
'a'
,
'b'
,
'c'
,
'd'
,
'e'
],
"gold"
:
[
'a'
,
'b'
,
'c'
,
'd'
,
'e'
].
index
(
doc
[
'correct'
]),
}
return
out_doc
ret
=
[]
def
_load_docs
(
self
,
docs
):
for
i
in
range
(
len
(
self
.
answer_options
)):
for
record
in
docs
:
ll
,
_
=
rf
.
loglikelihood
(
ctx
,
' '
+
self
.
answer_options
[
i
])
yield
self
.
_convert_standard
(
record
)
ret
.
append
(
ll
)
return
ret
def
training_docs
(
self
):
docs
=
super
().
training_docs
()
return
self
.
_load_docs
(
docs
)
def
process_results
(
self
,
doc
,
results
):
def
validation_docs
(
self
):
max_result_idx
=
max
(
enumerate
(
results
),
key
=
lambda
x
:
x
[
1
])[
0
]
docs
=
super
().
validation_docs
()
return
self
.
_load_docs
(
docs
)
if
doc
[
'correct'
]
==
self
.
answer_options
[
max_result_idx
]:
def
test_docs
(
self
):
result
=
1.0
docs
=
super
().
test_docs
()
else
:
return
self
.
_load_docs
(
docs
)
result
=
0.0
return
{
def
fewshot_description
(
self
):
"acc"
:
result
# TODO: figure out description
}
return
""
def
aggregation
(
self
):
return
{
"acc"
:
mean
,
}
def
higher_is_better
(
self
):
def
doc_to_text
(
self
,
doc
):
return
{
return
doc
[
"query"
]
"acc"
:
True
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment