gaoqiong / lm-evaluation-harness · Commits

Commit 2a1d7d87
Authored Feb 04, 2021 by Leo Gao

Merge branch 'master' of github.com:EleutherAI/lm_evaluation_harness

Parents: b1f7284e, a55a5c52
Showing 3 changed files with 50 additions and 15 deletions:
lm_eval/tasks/__init__.py   (+2, -2)
lm_eval/tasks/arc.py        (+45, -10)
lm_eval/tasks/arithmetic.py (+3, -3)
lm_eval/tasks/__init__.py

@@ -47,8 +47,8 @@ TASK_REGISTRY = {
     "piqa": piqa.PiQA,
     #"triviaqa": triviaqa.TriviaQA,
-    # "arc_easy": arc.ARCEasy, # not implemented yet
-    # "arc_challenge": arc.ARCChallenge, # not implemented yet
+    "arc_easy": arc.ARCEasy,
+    "arc_challenge": arc.ARCChallenge,
     # "quac": quac.QuAC, # not implemented yet
     "hellaswag": hellaswag.HellaSwag, # not implemented yet
     # "openbookqa": openbookqa.OpenBookQA, # not implemented yet
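For orientation, here is a minimal sketch of how a dict registry like TASK_REGISTRY is consumed. The get_task_class helper is a hypothetical illustration, not necessarily the harness's real entry point; only the TASK_REGISTRY dict itself comes from the diff above.

# Hypothetical consumer of the TASK_REGISTRY dict shown above. After this
# commit, the two ARC names resolve to real Task classes instead of
# commented-out placeholders.
from lm_eval.tasks import TASK_REGISTRY

def get_task_class(task_name):
    # The registry maps a task name ("arc_easy") to its Task class (arc.ARCEasy).
    try:
        return TASK_REGISTRY[task_name]
    except KeyError:
        raise KeyError(f"Unknown task: {task_name!r}")

arc_easy_cls = get_task_class("arc_easy")            # -> arc.ARCEasy
arc_challenge_cls = get_task_class("arc_challenge")  # -> arc.ARCChallenge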
lm_eval/tasks/arc.py

import numpy as np
from lm_eval.base import rf, mean
from .common import HFTask


class ARCEasy(HFTask):
    DATASET_PATH = "ai2_arc"
    DATASET_NAME = "ARC-Easy"

    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

    def __init__(self):
        super().__init__()
        self.data = self.__clean_data()

    def __clean_data(self):
        """ Resolves various edge cases in the unprocessed HF ARC dataset. """
        # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
        # of {'1', '2', '3', '4', '5'}. We map them back to letters.
        num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
        result = {}
        for split, data in self.data.items():
            result[split] = []
            for doc in data:
                # Ensure all `answerKey`s and `label`s are in letter format.
                doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
                doc["choices"]["label"] = [
                    num_to_letter.get(label, label)
                    for label in doc["choices"]["label"]
                ]
                result[split].append(doc)
        return result

    def has_training_docs(self):
        return True
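To make the cleanup concrete, here is a standalone demo of the same normalization on an invented document in the HF ai2_arc schema (the question text and choices are made up for illustration):

# Replicates __clean_data's answerKey/label normalization on one fake doc.
num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}

doc = {
    "question": "Which gas do plants absorb from the air?",  # invented example
    "choices": {"text": ["Oxygen", "Carbon dioxide"], "label": ['1', '2']},
    "answerKey": '2',  # numeric-string form found in some raw HF docs
}

doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
doc["choices"]["label"] = [num_to_letter.get(l, l) for l in doc["choices"]["label"]]

assert doc["answerKey"] == 'B'
assert doc["choices"]["label"] == ['A', 'B']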
@@ -21,7 +47,8 @@ class ARCEasy(HFTask):
         return "Question: " + doc['question'] + '\nAnswer:'

     def doc_to_target(self, doc):
-        return " " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]
+        index = self.letter_to_num[doc["answerKey"]]
+        return " " + doc['choices']['text'][index]

     def construct_requests(self, doc, ctx):
         """ Uses RequestFactory to construct Requests and returns an iterable of
@@ -34,9 +61,11 @@ class ARCEasy(HFTask):
         language description, as well as the few shot examples, and the question
         part of the document for `doc`.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        ll_choices = []
+        for choice in doc["choices"]["text"]:
+            ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
+        return ll_choices

     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
         dict where keys are the names of submetrics and values are the values of
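construct_requests now emits one loglikelihood request per answer option, each scoring " <choice>" as a continuation of the shared few-shot context ctx. A sketch of the resulting shape, with an invented doc and context (rf comes from lm_eval.base, as imported above):

from lm_eval.base import rf

doc = {"choices": {"text": ["red", "green", "blue", "yellow"]}}  # invented
ctx = "Question: What color is a ripe tomato?\nAnswer:"

# One pending request per choice; [0] selects the log-likelihood component
# of each request's eventual result, matching the diff's usage.
ll_choices = [rf.loglikelihood(ctx, " " + choice)[0] for choice in doc["choices"]["text"]]
assert len(ll_choices) == len(doc["choices"]["text"])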
@@ -47,8 +76,11 @@ class ARCEasy(HFTask):
         :param results:
             The results of the requests created in construct_requests.
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        gold = self.letter_to_num[doc["answerKey"]]
+        pred = np.argmax(results)
+        return {
+            "acc": pred == gold
+        }

     def aggregation(self):
         """
@@ -56,8 +88,9 @@ class ARCEasy(HFTask):
             A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": mean
+        }

     def higher_is_better(self):
         """
@@ -65,8 +98,10 @@ class ARCEasy(HFTask):
             A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        return {
+            "acc": True
+        }


 class ARCChallenge(ARCEasy):
     DATASET_PATH = "ai2_arc"
lm_eval/tasks/arithmetic.py

@@ -32,7 +32,7 @@ class Arithmetic(Task):
         self._docs = [self.load_doc(json.loads(line)) for line in jsons]

     def has_training_docs(self):
-        return True
+        return False

     def has_validation_docs(self):
         return True
@@ -41,10 +41,10 @@ class Arithmetic(Task):
         return False

     def training_docs(self):
-        return self._docs
+        return NotImplemented

     def validation_docs(self):
-        return self._docs[:100]
+        return self._docs

     def test_docs(self):
         return NotImplemented
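The arithmetic change flips which split serves the loaded docs: the task stops advertising a training split, and validation now returns every loaded doc rather than only the first 100. A stand-in class sketching the post-commit behavior (the real class loads self._docs from JSON lines as in the context above; the doc fields below are invented):

class ArithmeticLike:
    # Minimal stand-in mirroring the post-commit split behavior.
    def __init__(self, docs):
        self._docs = docs

    def has_training_docs(self):
        return False              # was True before this commit

    def training_docs(self):
        return NotImplemented     # previously returned self._docs

    def has_validation_docs(self):
        return True

    def validation_docs(self):
        return self._docs         # previously self._docs[:100]

task = ArithmeticLike([{"context": "Question: What is 2 plus 2?\nAnswer:", "completion": " 4"}])
assert len(task.validation_docs()) == 1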