gaoqiong / lm-evaluation-harness · Commits

Commit e26dc4d3 (unverified), authored Feb 11, 2021 by Leo Gao, committed via GitHub on Feb 11, 2021
Merge pull request #144 from jon-tow/arc-refactor
Refactor `ARC` as a `MultipleChoiceTask`
Parents: f7992789, 24ac76df
Showing 1 changed file with 32 additions and 83 deletions.

lm_eval/tasks/arc.py (+32, -83)
-import numpy as np
-from lm_eval.base import rf, mean
-from . common import HFTask
+from lm_eval.base import MultipleChoiceTask
+from .common import HFTask
 
 
-class ARCEasy(HFTask):
+class ARCEasy(HFTask, MultipleChoiceTask):
     DATASET_PATH = "ai2_arc"
     DATASET_NAME = "ARC-Easy"
-    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
-
-    def __init__(self):
-        super().__init__()
-        self.data = self.__clean_data()
-
-    def __clean_data(self):
-        """ Resolves various edge cases in the unprocessed HF ARC dataset. """
-        # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
-        # of {'1', '2', '3', '4', '5'}. We map them back to letters.
-        num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
-        result = {}
-        for split, data in self.data.items():
-            result[split] = []
-            for doc in data:
-                # Ensure all `answerKey`s and `label`s are in letter format.
-                doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
-                doc["choices"]["label"] = [
-                    num_to_letter.get(label, label) for label in doc["choices"]["label"]
-                ]
-                result[split].append(doc)
-        return result
 
     def has_training_docs(self):
         return True
@@ -39,68 +15,41 @@ class ARCEasy(HFTask):
     def has_test_docs(self):
         return True
 
-    def fewshot_description(self):
-        # TODO: figure out description
-        return ""
-
-    def doc_to_text(self, doc):
-        return "Question: " + doc['question'] + '\nAnswer:'
-
-    def doc_to_target(self, doc):
-        index = self.letter_to_num[doc["answerKey"]]
-        return " " + doc['choices']['text'][index]
-
-    def construct_requests(self, doc, ctx):
-        """ Uses RequestFactory to construct Requests and returns an iterable of
-        Requests which will be sent to the LM.
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param ctx: str
-            The context string, generated by fewshot_context. This includes the natural
-            language description, as well as the few shot examples, and the question
-            part of the document for `doc`.
-        """
-        ll_choices = []
-        for choice in doc["choices"]["text"]:
-            ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
-        return ll_choices
-
-    def process_results(self, doc, results):
-        """Take a single document and the LM results and evaluates, returning a
-        dict where keys are the names of submetrics and values are the values of
-        the metric for that one document
-
-        :param doc:
-            The document as returned from training_docs, validation_docs, or test_docs.
-        :param results:
-            The results of the requests created in construct_requests.
-        """
-        gold = self.letter_to_num[doc["answerKey"]]
-        pred = np.argmax(results)
-        return {"acc": pred == gold}
-
-    def aggregation(self):
-        """
-        :returns: {str: [float] -> float}
-            A dictionary where keys are the names of submetrics and values are
-            functions that aggregate a list of metrics
-        """
-        return {"acc": mean}
-
-    def higher_is_better(self):
-        """
-        :returns: {str: bool}
-            A dictionary where keys are the names of submetrics and values are
-            whether a higher value of the submetric is better
-        """
-        return {"acc": True}
+    def _convert_standard(self, doc):
+        # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
+        # of {'1', '2', '3', '4', '5'}. We map them back to letters.
+        num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
+        doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
+        out_doc = {
+            "id": doc["id"],
+            "query": "Question: " + doc["question"] + "\nAnswer:",
+            "choices": doc["choices"]["text"],
+            "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
+        }
+        return out_doc
+
+    def _load_docs(self, docs):
+        for record in docs:
+            yield self._convert_standard(record)
+
+    def training_docs(self):
+        docs = super().training_docs()
+        return self._load_docs(docs)
+
+    def validation_docs(self):
+        docs = super().validation_docs()
+        return self._load_docs(docs)
+
+    def test_docs(self):
+        docs = super().test_docs()
+        return self._load_docs(docs)
+
+    def fewshot_description(self):
+        # TODO: figure out description
+        return ""
+
+    def doc_to_text(self, doc):
+        return doc["query"]
 
 
 class ARCChallenge(ARCEasy):
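For context on the new document schema: `_convert_standard` maps each raw HF record to a flat dict with `query`, `choices`, and `gold` keys, where `gold` is the integer index of the correct choice. A worked example of that mapping (the record shown is illustrative and not taken from this commit):

# A raw ai2_arc record (illustrative values):
raw_doc = {
    "id": "Mercury_7220990",
    "question": "Which factor will most likely cause a person to develop a fever?",
    "choices": {
        "text": [
            "a leg muscle relaxing after exercise",
            "a bacterial population in the bloodstream",
            "several viral particles on the skin",
            "carbohydrates being digested in the stomach",
        ],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

# ARCEasy()._convert_standard(raw_doc) would return:
# {
#     "id": "Mercury_7220990",
#     "query": "Question: Which factor will most likely cause a person "
#              "to develop a fever?\nAnswer:",
#     "choices": [...the four answer strings from raw_doc...],
#     "gold": 1,  # index of answerKey "B" in ["A", "B", "C", "D", "E"]
# }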
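The removed `construct_requests`, `process_results`, `aggregation`, and `higher_is_better` methods are not deleted outright; they presumably move into the shared `MultipleChoiceTask` base class in `lm_eval.base`, operating on the `query`/`choices`/`gold` schema above. A minimal sketch of that assumed contract, reconstructed from the per-task logic this diff removes (method bodies are illustrative, not the actual `lm_eval.base` source):

import numpy as np

from lm_eval.base import Task, rf, mean


class MultipleChoiceTask(Task):
    # Assumed: every doc is {"query": str, "choices": [str], "gold": int}.

    def doc_to_target(self, doc):
        # Target is the gold choice text, space-prefixed for tokenization.
        return " " + doc["choices"][doc["gold"]]

    def construct_requests(self, doc, ctx):
        # One loglikelihood request per candidate answer.
        return [rf.loglikelihood(ctx, " " + choice)[0] for choice in doc["choices"]]

    def process_results(self, doc, results):
        # The prediction is the choice with the highest loglikelihood.
        return {"acc": np.argmax(results) == doc["gold"]}

    def aggregation(self):
        return {"acc": mean}

    def higher_is_better(self):
        return {"acc": True}

Under this reading, each dataset-specific task only needs to convert its records into the standard dict shape, which is exactly what the new `_convert_standard` and `_load_docs` methods do for ARC.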