gaoqiong / lm-evaluation-harness · Commits · 1f8a8c1d

Commit 1f8a8c1d, authored Jun 11, 2022 by jon-tow
Merge branch 'master' of https://github.com/EleutherAI/lm-evaluation-harness into remove-dataset
Parents: b4c0275d, b0acb337

Changes: 255 files in this commit. Showing 20 changed files with 1018 additions and 750 deletions (+1018 / -750).
lm_eval/tasks/quac.py                                 +123  -106
lm_eval/tasks/race.py                                  +53   -42
lm_eval/tasks/sat.py                                   +13    -5
lm_eval/tasks/sciq.py                                   +8    -2
lm_eval/tasks/squad.py                                +219  -163
lm_eval/tasks/storycloze.py                            +27   -21
lm_eval/tasks/superglue.py                             +72   -98
lm_eval/tasks/translation.py                           +28    -9
lm_eval/tasks/triviaqa.py                              +12   -10
lm_eval/tasks/truthfulqa.py                            +63   -69
lm_eval/tasks/unscramble.py                             +9    -9
lm_eval/tasks/webqs.py                                 +13   -11
lm_eval/tasks/wikitext.py                               +5    -2
lm_eval/tasks/winogrande.py                           +132  -132
lm_eval/tasks/wsc273.py                                +24   -13
lm_eval/utils.py                                       +47   -32
main.py                                                +54   -19
pile_statistics.json                                   +37    -0
scripts/clean_training_data/README.md                   +6    -7
scripts/clean_training_data/compress_and_package.py    +73    -0
lm_eval/tasks/quac.py

@@ -51,17 +51,34 @@ class QuAC(Task):
         raise NotImplementedError("QuAC has no test docs.")

     def _process_doc(self, doc):
-        doc["title"] = doc['title'] + ' - ' + doc['section_title']
+        doc["title"] = doc["title"] + " - " + doc["section_title"]
         return doc

     def doc_to_text(self, doc):
-        return 'TITLE: ' + doc['title'] + '\n' + 'PARAGRAPH: ' + doc['paragraph'] + '\n\n' + 'Q: ' + doc['question'] + '\n\n' + 'A: '
+        return (
+            "TITLE: " + doc["title"] + "\n"
+            + "PARAGRAPH: " + doc["paragraph"] + "\n\n"
+            + "Q: " + doc["question"] + "\n\n"
+            + "A: "
+        )
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["paragraph"]

     def doc_to_target(self, doc):
-        return doc['answer']
+        return doc["answer"]

     def construct_requests(self, doc, ctx):
-        """
-        Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:

@@ -72,7 +89,7 @@ class QuAC(Task):
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

@@ -85,7 +102,7 @@ class QuAC(Task):
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

@@ -94,7 +111,7 @@ class QuAC(Task):
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")

@@ -103,4 +120,4 @@ class QuAC(Task):
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
+        raise NotImplementedError("Evaluation not implemented")
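A pattern worth noting across the task diffs in this commit: each task gains a should_decontaminate / doc_to_decontamination_query pair. As a rough sketch only (the harness's actual decontamination pass is not shown on this page, and collect_decontamination_queries below is a hypothetical helper), the hooks could be consumed like this:

# Hypothetical helper, not part of the harness: gathers the strings that an
# n-gram overlap check would scan for a task that opts in to decontamination.
def collect_decontamination_queries(task, docs):
    if not task.should_decontaminate():
        return []
    return [task.doc_to_decontamination_query(doc) for doc in docs]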
lm_eval/tasks/race.py

@@ -40,7 +40,7 @@ class RACE(Task):
     DATASET_NAME = "high"

     cache = {}
-    letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
+    letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}

     def has_training_docs(self):
         return True

@@ -59,17 +59,27 @@ class RACE(Task):
         # is shown that one document is made per passage.

         r = collections.defaultdict(list)
-        for item in datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)[set]:
-            r[item['article']].append(item)
-        res = list(r.values() >> each(lambda x: {
-            'article': x[0]['article'],
-            'problems': x >> each(lambda y: {
-                'question': y['question'],
-                'answer': y['answer'],
-                'options': y['options'],
-            })
-        }))
+        for item in datasets.load_dataset(
+            path=self.DATASET_PATH, name=self.DATASET_NAME
+        )[set]:
+            r[item["article"]].append(item)
+        res = list(
+            r.values()
+            >> each(
+                lambda x: {
+                    "article": x[0]["article"],
+                    "problems": x
+                    >> each(
+                        lambda y: {
+                            "question": y["question"],
+                            "answer": y["answer"],
+                            "options": y["options"],
+                        }
+                    ),
+                }
+            )
+        )

         self.cache[set] = res
         return res

@@ -85,30 +95,38 @@ class RACE(Task):
     @classmethod
     def get_answer_option(cls, problem):
-        answer = cls.letter_to_num[problem['answer']]
-        return problem['options'][answer]
+        answer = cls.letter_to_num[problem["answer"]]
+        return problem["options"][answer]

     @classmethod
     def last_problem(cls, doc):
-        return doc['problems'][-1]
+        return doc["problems"][-1]

     def doc_to_text(self, doc):
-        text = 'Article: ' + doc['article'] + '\n\n'
-        for problem in doc['problems'][:-1]:
-            if problem['question'][-6:] == ' _ .':
-                text += problem['question'][-5:] + self.get_answer_option(problem) + '\n'
+        text = "Article: " + doc["article"] + "\n\n"
+        for problem in doc["problems"][:-1]:
+            if problem["question"][-6:] == " _ .":
+                text += (
+                    problem["question"][-5:] + self.get_answer_option(problem) + "\n"
+                )
             else:
-                question = 'Question: ' + problem['question'] + '\n'
-                answer = 'Answer: ' + self.get_answer_option(problem) + '\n'
+                question = "Question: " + problem["question"] + "\n"
+                answer = "Answer: " + self.get_answer_option(problem) + "\n"
                 text += question + answer
-        text += self.last_problem(doc)['question']
+        text += self.last_problem(doc)["question"]
         return text

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["article"]
+
     def doc_to_target(self, doc):
         return " " + self.get_answer_option(self.last_problem(doc))

     def construct_requests(self, doc, ctx):
-        """
-        Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:

@@ -120,8 +138,7 @@ class RACE(Task):
         """
         problem = self.last_problem(doc)
         ll_choices = [
-            rf.loglikelihood(ctx, " " + problem['options'][i])[0]
-            for i in range(4)
+            rf.loglikelihood(ctx, " " + problem["options"][i])[0] for i in range(4)
         ]
         return ll_choices

@@ -135,11 +152,9 @@ class RACE(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
-        gold = self.letter_to_num[self.last_problem(doc)['answer']]
+        gold = self.letter_to_num[self.last_problem(doc)["answer"]]
         pred = np.argmax(results)
-        return {
-            "acc": int(pred == gold)
-        }
+        return {"acc": int(pred == gold)}

@@ -147,9 +162,7 @@ class RACE(Task):
         A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

@@ -157,6 +170,4 @@ class RACE(Task):
         A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        return {
-            "acc": True
-        }
+        return {"acc": True}
lm_eval/tasks/sat.py

@@ -59,11 +59,19 @@ class SATAnalogies(MultipleChoiceTask):
     def _process_doc(self, doc):
         return {
-            'source': doc['source'],
-            'query': doc['stem'].split(' ')[:2],
-            'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in doc["choices"]],
-            'gold': ['a', 'b', 'c', 'd', 'e'].index(doc['solution'].strip()),
+            "source": doc["source"],
+            "query": doc["stem"].split(" ")[:2],
+            "choices": [
+                "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"]
+            ],
+            "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()),
         }

     def doc_to_text(self, doc):
-        return "{} is to {} as".format(*doc['query'])
+        return "{} is to {} as".format(*doc["query"])
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["source"] + "\n" + " ".join(doc["query"])
lm_eval/tasks/sciq.py

@@ -54,10 +54,10 @@ class SciQ(MultipleChoiceTask):
             doc["distractor3"],
             doc["correct_answer"],
         ]
-        src = doc['support']
+        src = doc["support"]
         out_doc = {
             "source": src,
-            "query": doc['question'],
+            "query": doc["question"],
             "choices": choices,
             "gold": 3,
         }

@@ -65,3 +65,9 @@ class SciQ(MultipleChoiceTask):
     def doc_to_text(self, doc):
         return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip()
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["source"] + " " + doc["query"]
lm_eval/tasks/squad.py

@@ -49,7 +49,9 @@ class SQuAD2(Task):
     DATASET_NAME = None

     # HF changed squad on us so we have to make sure we aren't running the old one
-    assert version.parse(datasets.__version__) >= version.parse("1.11.0"), "datasets v1.11.0 or later required for SQuAD"
+    assert version.parse(datasets.__version__) >= version.parse(
+        "1.11.0"
+    ), "datasets v1.11.0 or later required for SQuAD"

     def has_training_docs(self):
         return True

@@ -67,18 +69,35 @@ class SQuAD2(Task):
         return self.dataset["validation"]

     def doc_to_text(self, doc):
-        return 'Title: ' + doc['title'] + '\n\n' + 'Background: ' + doc['context'] + '\n\n' + 'Question: ' + doc['question'] + '\n\n' + 'Answer:'
+        return (
+            "Title: " + doc["title"] + "\n\n"
+            + "Background: " + doc["context"] + "\n\n"
+            + "Question: " + doc["question"] + "\n\n"
+            + "Answer:"
+        )
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["context"]

     def doc_to_target(self, doc):
-        answer_list = doc['answers']['text']
+        answer_list = doc["answers"]["text"]
         if len(answer_list) > 0:
             answer = answer_list[0]
         else:
-            answer = 'unanswerable'
+            answer = "unanswerable"
         return " " + answer

     def construct_requests(self, doc, ctx):
-        """
-        Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:

@@ -88,7 +107,7 @@ class SQuAD2(Task):
         language description, as well as the few shot examples, and the question
         part of the document for `doc`.
         """
-        continuation = rf.greedy_until(ctx, ['\n'])
+        continuation = rf.greedy_until(ctx, ["\n"])
         is_unanswerable = rf.loglikelihood(ctx, " " + "unanswerable")
         return continuation, is_unanswerable

@@ -107,25 +126,46 @@ class SQuAD2(Task):
         no_answer_probability = exp(logprob_unanswerable)

         predictions = {
-            'id': doc['id'],
-            'prediction_text': continuation,
-            'no_answer_probability': no_answer_probability,
+            "id": doc["id"],
+            "prediction_text": continuation,
+            "no_answer_probability": no_answer_probability,
         }

         references = {
-            'id': doc['id'],
-            'answers': doc['answers'],
+            "id": doc["id"],
+            "answers": doc["answers"],
         }

         return {
-            'exact': (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
-            'f1': (predictions, references),  # The F-score of predicted tokens versus the gold answer
-            'HasAns_exact': (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
-            'HasAns_f1': (predictions, references),  # The F-score of predicted tokens versus the gold answer
-            'NoAns_exact': (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
-            'NoAns_f1': (predictions, references),  # The F-score of predicted tokens versus the gold answer
-            'best_exact': (predictions, references),  # Best exact match (with varying threshold)
-            'best_f1': (predictions, references),  # Best F1 (with varying threshold)
+            "exact": (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
+            "f1": (predictions, references),  # The F-score of predicted tokens versus the gold answer
+            "HasAns_exact": (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
+            "HasAns_f1": (predictions, references),  # The F-score of predicted tokens versus the gold answer
+            "NoAns_exact": (predictions, references),  # Exact match (the normalized answer exactly match the gold answer)
+            "NoAns_f1": (predictions, references),  # The F-score of predicted tokens versus the gold answer
+            "best_exact": (predictions, references),  # Best exact match (with varying threshold)
+            "best_f1": (predictions, references),  # Best F1 (with varying threshold)
         }

@@ -135,14 +175,30 @@ class SQuAD2(Task):
             functions that aggregate a list of metrics
         """
         return {
-            'exact': partial(_squad_agg, 'exact'),  # Exact match (the normalized answer exactly match the gold answer)
-            'f1': partial(_squad_agg, 'f1'),  # The F-score of predicted tokens versus the gold answer
-            'HasAns_exact': partial(_squad_agg, 'HasAns_exact'),  # Exact match (the normalized answer exactly match the gold answer)
-            'HasAns_f1': partial(_squad_agg, 'HasAns_f1'),  # The F-score of predicted tokens versus the gold answer
-            'NoAns_exact': partial(_squad_agg, 'NoAns_exact'),  # Exact match (the normalized answer exactly match the gold answer)
-            'NoAns_f1': partial(_squad_agg, 'NoAns_f1'),  # The F-score of predicted tokens versus the gold answer
-            'best_exact': partial(_squad_agg, 'best_exact'),  # Best exact match (with varying threshold)
-            'best_f1': partial(_squad_agg, 'best_f1'),  # Best F1 (with varying threshold)
+            "exact": partial(_squad_agg, "exact"),  # Exact match (the normalized answer exactly match the gold answer)
+            "f1": partial(_squad_agg, "f1"),  # The F-score of predicted tokens versus the gold answer
+            "HasAns_exact": partial(_squad_agg, "HasAns_exact"),  # Exact match (the normalized answer exactly match the gold answer)
+            "HasAns_f1": partial(_squad_agg, "HasAns_f1"),  # The F-score of predicted tokens versus the gold answer
+            "NoAns_exact": partial(_squad_agg, "NoAns_exact"),  # Exact match (the normalized answer exactly match the gold answer)
+            "NoAns_f1": partial(_squad_agg, "NoAns_f1"),  # The F-score of predicted tokens versus the gold answer
+            "best_exact": partial(_squad_agg, "best_exact"),  # Best exact match (with varying threshold)
+            "best_f1": partial(_squad_agg, "best_f1"),  # Best F1 (with varying threshold)
         }

@@ -152,12 +208,12 @@ class SQuAD2(Task):
             whether a higher value of the submetric is better
         """
         return {
-            'exact': True,  # Exact match (the normalized answer exactly match the gold answer)
-            'f1': True,  # The F-score of predicted tokens versus the gold answer
-            'HasAns_exact': True,  # Exact match (the normalized answer exactly match the gold answer)
-            'HasAns_f1': True,  # The F-score of predicted tokens versus the gold answer
-            'NoAns_exact': True,  # Exact match (the normalized answer exactly match the gold answer)
-            'NoAns_f1': True,  # The F-score of predicted tokens versus the gold answer
-            'best_exact': True,  # Best exact match (with varying threshold)
-            'best_f1': True,  # Best F1 (with varying threshold)
+            "exact": True,  # Exact match (the normalized answer exactly match the gold answer)
+            "f1": True,  # The F-score of predicted tokens versus the gold answer
+            "HasAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
+            "HasAns_f1": True,  # The F-score of predicted tokens versus the gold answer
+            "NoAns_exact": True,  # Exact match (the normalized answer exactly match the gold answer)
+            "NoAns_f1": True,  # The F-score of predicted tokens versus the gold answer
+            "best_exact": True,  # Best exact match (with varying threshold)
+            "best_f1": True,  # Best F1 (with varying threshold)
         }
lm_eval/tasks/storycloze.py

@@ -65,12 +65,27 @@ class StoryCloze(Task):
         return self.dataset["test"]

     def doc_to_text(self, doc):
-        return ' '.join([
-            doc["input_sentence_1"],
-            doc["input_sentence_2"],
-            doc["input_sentence_3"],
-            doc["input_sentence_4"],
-        ])
+        return " ".join(
+            [
+                doc["input_sentence_1"],
+                doc["input_sentence_2"],
+                doc["input_sentence_3"],
+                doc["input_sentence_4"],
+            ]
+        )
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return " ".join(
+            [
+                doc["input_sentence_1"],
+                doc["input_sentence_2"],
+                doc["input_sentence_3"],
+                doc["input_sentence_4"],
+            ]
+        )

     def doc_to_target(self, doc):
         clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]]

@@ -78,7 +93,7 @@ class StoryCloze(Task):
         return " " + clozes[doc["answer_right_ending"] - 1]

     def construct_requests(self, doc, ctx):
-        """
-        Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:

@@ -89,10 +104,7 @@ class StoryCloze(Task):
         part of the document for `doc`.
         """
         clozes = [doc["sentence_quiz1"], doc["sentence_quiz2"]]
-        lls = [
-            rf.loglikelihood(ctx, " {}".format(choice))[0]
-            for choice in clozes
-        ]
+        lls = [rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in clozes]
         return lls

     def process_results(self, doc, results):

@@ -106,10 +118,8 @@ class StoryCloze(Task):
             The results of the requests created in construct_requests.
         """
         gold = doc["answer_right_ending"] - 1
-        acc = 1. if np.argmax(results) == gold else 0.
-        return {
-            "acc": acc
-        }
+        acc = 1.0 if np.argmax(results) == gold else 0.0
+        return {"acc": acc}

@@ -117,9 +127,7 @@ class StoryCloze(Task):
         A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

@@ -127,9 +135,7 @@ class StoryCloze(Task):
         A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        return {
-            "acc": True
-        }
+        return {"acc": True}


 class StoryCloze2016(StoryCloze):
lm_eval/tasks/superglue.py

@@ -57,13 +57,19 @@ class BoolQ(Task):
     def doc_to_text(self, doc):
         return f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["passage"]
+
     def doc_to_target(self, doc):
-        return " " + yesno(doc['label'])
+        return " " + yesno(doc["label"])

     def construct_requests(self, doc, ctx):
-        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
-        ll_no, _ = rf.loglikelihood(ctx, ' no')
+        ll_yes, _ = rf.loglikelihood(ctx, " yes")
+        ll_no, _ = rf.loglikelihood(ctx, " no")
         return ll_yes, ll_no

@@ -71,21 +77,15 @@ class BoolQ(Task):
         ll_yes, ll_no = results
         gold = doc["label"]

-        acc = 1. if (ll_yes > ll_no) == gold else 0.
+        acc = 1.0 if (ll_yes > ll_no) == gold else 0.0

-        return {
-            "acc": acc
-        }
+        return {"acc": acc}

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

     def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class CommitmentBank(Task):

@@ -123,27 +123,21 @@ class CommitmentBank(Task):
         return " {}".format({0: "True", 1: "False", 2: "Neither"}[doc["label"]])

     def construct_requests(self, doc, ctx):
-        ll_true, _ = rf.loglikelihood(ctx, ' True')
-        ll_false, _ = rf.loglikelihood(ctx, ' False')
-        ll_neither, _ = rf.loglikelihood(ctx, ' Neither')
+        ll_true, _ = rf.loglikelihood(ctx, " True")
+        ll_false, _ = rf.loglikelihood(ctx, " False")
+        ll_neither, _ = rf.loglikelihood(ctx, " Neither")
         return ll_true, ll_false, ll_neither

     def process_results(self, doc, results):
         gold = doc["label"]
         pred = np.argmax(results)
-        acc = 1. if pred == gold else 0.
-        return {
-            "acc": acc,
-            "f1": (pred, gold)
-        }
+        acc = 1.0 if pred == gold else 0.0
+        return {"acc": acc, "f1": (pred, gold)}

     def higher_is_better(self):
-        return {
-            "acc": True,
-            "f1": True
-        }
+        return {"acc": True, "f1": True}

     @classmethod
     def cb_multi_fi(cls, items):

@@ -210,21 +204,15 @@ class Copa(Task):
     def process_results(self, doc, results):
         gold = doc["label"]
         pred = np.argmax(results)
-        acc = 1. if pred == gold else 0.
-        return {
-            "acc": acc
-        }
+        acc = 1.0 if pred == gold else 0.0
+        return {"acc": acc}

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

     def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

     @staticmethod
     def convert_choice(choice):

@@ -268,27 +256,21 @@ class MultiRC(Task):
         true_choice = self.format_answer(answer=doc["answer"], label=True)
         false_choice = self.format_answer(answer=doc["answer"], label=False)

-        ll_true_choice, _ = rf.loglikelihood(ctx, f' {true_choice}')
-        ll_false_choice, _ = rf.loglikelihood(ctx, f' {false_choice}')
+        ll_true_choice, _ = rf.loglikelihood(ctx, f" {true_choice}")
+        ll_false_choice, _ = rf.loglikelihood(ctx, f" {false_choice}")

         return ll_true_choice, ll_false_choice

     def process_results(self, doc, results):
         ll_true_choice, ll_false_choice = results
         pred = ll_true_choice > ll_false_choice
-        return {
-            "acc": (pred, doc)
-        }
+        return {"acc": (pred, doc)}

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

     def aggregation(self):
-        return {
-            "acc": acc_all
-        }
+        return {"acc": acc_all}


 class ReCoRD(Task):

@@ -337,7 +319,7 @@ class ReCoRD(Task):
     @classmethod
     def format_answer(cls, query, entity):
-        return f'  - {query}'.replace("@placeholder", entity)
+        return f"  - {query}".replace("@placeholder", entity)

     def doc_to_target(self, doc):
         # We only output the first correct entity in a doc

@@ -359,8 +341,12 @@ class ReCoRD(Task):
         prediction = doc["entities"][max_idx]
         gold_label_set = doc["answers"]
-        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
-        em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)
+        f1 = metric_max_over_ground_truths(
+            squad_metrics.compute_f1, prediction, gold_label_set
+        )
+        em = metric_max_over_ground_truths(
+            squad_metrics.compute_exact, prediction, gold_label_set
+        )

         return {
             "f1": f1,

@@ -403,19 +389,21 @@ class WordsInContext(Task):
         return self.dataset["validation"]

     def doc_to_text(self, doc):
-        return "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the" \
-               " two sentences above?\nAnswer:".format(
-                   doc["sentence1"],
-                   doc["sentence2"],
-                   doc["sentence1"][doc["start1"]:doc["end1"]],
-               )
+        return (
+            "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
+            " two sentences above?\nAnswer:".format(
+                doc["sentence1"],
+                doc["sentence2"],
+                doc["sentence1"][doc["start1"] : doc["end1"]],
+            )
+        )

     def doc_to_target(self, doc):
         return " {}".format({0: "no", 1: "yes"}[doc["label"]])

     def construct_requests(self, doc, ctx):
-        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
-        ll_no, _ = rf.loglikelihood(ctx, ' no')
+        ll_yes, _ = rf.loglikelihood(ctx, " yes")
+        ll_no, _ = rf.loglikelihood(ctx, " no")
         return ll_yes, ll_no

@@ -423,21 +411,15 @@ class WordsInContext(Task):
         ll_yes, ll_no = results
         gold = doc["label"]

-        acc = 1. if (ll_yes > ll_no) == gold else 0.
+        acc = 1.0 if (ll_yes > ll_no) == gold else 0.0

-        return {
-            "acc": acc
-        }
+        return {"acc": acc}

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

     def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}


 class SGWinogradSchemaChallenge(Task):

@@ -461,9 +443,7 @@ class SGWinogradSchemaChallenge(Task):
         if self._training_docs is None:
             # GPT-3 Paper's format only uses positive examples for fewshot "training"
             self._training_docs = [
-                doc
-                for doc in self.dataset["train"]
-                if doc["label"]
+                doc for doc in self.dataset["train"] if doc["label"]
             ]
         return self._training_docs

@@ -473,25 +453,25 @@ class SGWinogradSchemaChallenge(Task):
     def doc_to_text(self, doc):
         raw_passage = doc["text"]
         # NOTE: HuggingFace span indices are word-based not character-based.
-        pre = " ".join(raw_passage.split()[:doc["span2_index"]])
-        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1:]
-        passage = general_detokenize(pre + " *{}*".format(doc['span2_text']) + post)
+        pre = " ".join(raw_passage.split()[: doc["span2_index"]])
+        post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
+        passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
         noun = doc["span1_text"]
         pronoun = doc["span2_text"]
         text = (
             f"Passage: {passage}\n"
-            + f"Question: In the passage above, does the pronoun \"*{pronoun}*\" refer to \"*{noun}*\"?\n"
+            + f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
             + "Answer:"
         )
         return text

     def doc_to_target(self, doc):
-        return " " + yesno(doc['label'])
+        return " " + yesno(doc["label"])

     def construct_requests(self, doc, ctx):
-        ll_yes, _ = rf.loglikelihood(ctx, ' yes')
-        ll_no, _ = rf.loglikelihood(ctx, ' no')
+        ll_yes, _ = rf.loglikelihood(ctx, " yes")
+        ll_no, _ = rf.loglikelihood(ctx, " no")
         return ll_yes, ll_no

@@ -499,18 +479,12 @@ class SGWinogradSchemaChallenge(Task):
         ll_yes, ll_no = results
         gold = doc["label"]

-        acc = 1. if (ll_yes > ll_no) == gold else 0.
+        acc = 1.0 if (ll_yes > ll_no) == gold else 0.0

-        return {
-            "acc": acc
-        }
+        return {"acc": acc}

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}

     def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}
lm_eval/tasks/translation.py

@@ -41,44 +41,57 @@ def create_tasks_from_benchmarks(benchmark_dict):
     :return: {task_name: task}
         e.g. {wmt14-fr-en: Task, wmt16-de-en: Task}
     """
+
     def version_of(dataset, language_pair):
         if language_pair[-2:] in ["zh", "ja"]:
             return 1  # changed to use jieba/nagisa
         return 0
+
     return {
-        f"{dataset}-{language_pair}": create_translation_task(dataset, language_pair, version_of(dataset, language_pair))
+        f"{dataset}-{language_pair}": create_translation_task(
+            dataset, language_pair, version_of(dataset, language_pair)
+        )
         for dataset, language_pairs in benchmark_dict.items()
         for language_pair in language_pairs
     }


 ########################################
 # Language Specifics
 ########################################


 def zh_split(zh_text: List[str]) -> List[str]:
     """Chinese splitting"""
     import jieba
+
     return [" ".join(jieba.cut(txt.strip())) for txt in zh_text]


 def ja_split(ja_text: List[str]) -> List[str]:
     """Japanese splitting"""
     import nagisa
+
     return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text]


 NO_SPACE_LANG = {"zh": zh_split, "ja": ja_split}

 ########################################
 # Tasks
 ########################################


 def create_translation_task(dataset, language_pair, version=0):
     class TranslationTask(GeneralTranslationTask):
         VERSION = version

         def __init__(self):
             super().__init__(dataset, language_pair)

     return TranslationTask


 class GeneralTranslationTask(Task):
     VERSION = 0

@@ -92,8 +105,9 @@ class GeneralTranslationTask(Task):
     def download(self, data_dir=None, cache_dir=None, download_mode=None):
         # This caches in the users home dir automatically
-        self.src_file, self.ref_file = \
-            sacrebleu.download_test_set(self.sacrebleu_dataset, self.sacrebleu_language_pair)
+        self.src_file, self.ref_file = sacrebleu.download_test_set(
+            self.sacrebleu_dataset, self.sacrebleu_language_pair
+        )
         self.src_data, self.ref_data = [
             [line.rstrip() for line in sacrebleu.smart_open(file)]
             for file in (self.src_file, self.ref_file)

@@ -117,10 +131,9 @@ class GeneralTranslationTask(Task):
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
-        return [{"src": src, "ref": ref} for src, ref in zip(self.src_data, self.ref_data)]
+        return [
+            {"src": src, "ref": ref} for src, ref in zip(self.src_data, self.ref_data)
+        ]

     def doc_to_text(self, doc):
         language_codes = self.sacrebleu_language_pair.split("-")

@@ -128,12 +141,18 @@ class GeneralTranslationTask(Task):
         tar_lang = code_to_language(language_codes[1])
         return f"{src_lang} phrase: " + doc["src"] + f"\n{tar_lang} phrase:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["src"]
+
     def doc_to_target(self, doc):
         # This shows a single target, though there may be multiple targets in a lang test
         return " " + doc["ref"] if isinstance(doc["ref"], str) else doc["ref"][0]

     def construct_requests(self, doc, ctx):
-        """
-        Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:
lm_eval/tasks/triviaqa.py

@@ -43,10 +43,10 @@ class TriviaQA(Task):
         return False

     def training_docs(self):
-        return self.dataset['train']
+        return self.dataset["train"]

     def validation_docs(self):
-        return self.dataset['validation']
+        return self.dataset["validation"]

     def test_docs(self):
         raise NotImplementedError()

@@ -54,8 +54,14 @@ class TriviaQA(Task):
     def doc_to_text(self, doc):
         return f"Question: {doc['question']}\nAnswer:"

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["question"]
+
     def doc_to_target(self, doc):
-        return " " + doc['answer']['value']
+        return " " + doc["answer"]["value"]

     def _remove_prefixes(self, aliases):
         # Optimization: Remove any alias that has a strict prefix elsewhere in the list

@@ -69,15 +75,13 @@ class TriviaQA(Task):
     def construct_requests(self, doc, ctx):
         ret = []
-        for alias in self._remove_prefixes(doc['answer']['aliases']):
+        for alias in self._remove_prefixes(doc["answer"]["aliases"]):
             _, is_prediction = rf.loglikelihood(ctx, " " + alias)
             ret.append(is_prediction)
         return ret

     def process_results(self, doc, results):
-        return {
-            "acc": float(any(results))
-        }
+        return {"acc": float(any(results))}

     def aggregation(self):
         return {

@@ -85,6 +89,4 @@ class TriviaQA(Task):
         }

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}
lm_eval/tasks/truthfulqa.py

@@ -80,22 +80,29 @@ class TruthfulQAMultipleChoice(Task):
         raise NotImplementedError()

     def doc_to_text(self, doc):
-        return QA_PROMPT + "\n\nQ: " + doc['question'] + "\nA:"
+        return QA_PROMPT + "\n\nQ: " + doc["question"] + "\nA:"
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["question"]

     def doc_to_target(self, doc):
         return " "

-    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
-        assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
+    def fewshot_context(
+        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
+    ):
+        assert (
+            num_fewshot == 0
+        ), "TruthfulQA is intended only for the zero-shot setting."
         return super().fewshot_context(
             doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
         )

     def construct_requests(self, doc, ctx):
-        """
-        Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:

@@ -105,11 +112,15 @@ class TruthfulQAMultipleChoice(Task):
         language description, as well as the few shot examples, and the question
         part of the document for `doc`.
         """
+
         def get_lls(targets):
             return [rf.loglikelihood(ctx, " " + t)[0] for t in targets]

         # MC1 and MC2 targets are not always the same set of strings so we collect
         # likelihoods separately for simpler processing.
-        return get_lls(doc['mc1_targets']["choices"]) + get_lls(doc['mc2_targets']["choices"])
+        return get_lls(doc["mc1_targets"]["choices"]) + get_lls(
+            doc["mc2_targets"]["choices"]
+        )

     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a

@@ -121,37 +132,29 @@ class TruthfulQAMultipleChoice(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
+
         def mc1(lls):
             # The gold answers in `mc1_targets` are always first (index = `0`).
             return np.argmax(lls) == 0

         def mc2(lls):
             # Split on the first `0` as everything before it is true (`1`).
-            split_idx = list(doc['mc2_targets']["labels"]).index(0)
+            split_idx = list(doc["mc2_targets"]["labels"]).index(0)
             # Compute the normalized probability mass for the correct answer.
             ll_true, ll_false = lls[:split_idx], lls[split_idx:]
             p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
             p_true = p_true / (sum(p_true) + sum(p_false))
             return sum(p_true)

-        split_idx = len(doc['mc1_targets']["choices"])
+        split_idx = len(doc["mc1_targets"]["choices"])
         mc1_lls, mc2_lls = results[:split_idx], results[split_idx:]
-        return {
-            "mc1": mc1(mc1_lls),
-            "mc2": mc2(mc2_lls)
-        }
+        return {"mc1": mc1(mc1_lls), "mc2": mc2(mc2_lls)}

     def aggregation(self):
-        return {
-            "mc1": mean,
-            "mc2": mean
-        }
+        return {"mc1": mean, "mc2": mean}

     def higher_is_better(self):
-        return {
-            "mc1": True,
-            "mc2": True
-        }
+        return {"mc1": True, "mc2": True}


 class TruthfulQAGeneration(Task):

@@ -181,44 +184,45 @@ class TruthfulQAGeneration(Task):
             answer = answer.strip()
             if len(answer):
                 # Add a period after all answers.
-                if answer[-1] != '.':
-                    formatted_answers.append(answer + '.')
+                if answer[-1] != ".":
+                    formatted_answers.append(answer + ".")
                 else:
                     formatted_answers.append(answer)
         return formatted_answers

     def validation_docs(self):
         for doc in self.dataset["validation"]:
-            incorrect_answers = self._format_answers(doc['incorrect_answers'])
-            correct_answers = self._format_answers(doc['correct_answers'])
+            incorrect_answers = self._format_answers(doc["incorrect_answers"])
+            correct_answers = self._format_answers(doc["correct_answers"])
             if "I have no comment." not in correct_answers:
                 correct_answers.append("I have no comment.")
             yield {
-                'question': doc['question'].strip(),
-                'correct_answers': correct_answers,
-                'incorrect_answers': incorrect_answers
+                "question": doc["question"].strip(),
+                "correct_answers": correct_answers,
+                "incorrect_answers": incorrect_answers,
             }

     def test_docs(self):
         raise NotImplementedError()

     def doc_to_text(self, doc):
-        return QA_PROMPT + "\n\nQ: " + doc['question']
+        return QA_PROMPT + "\n\nQ: " + doc["question"]

     def doc_to_target(self, doc):
         return " "

-    def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
-        assert num_fewshot == 0, "TruthfulQA is intended only for the zero-shot setting."
+    def fewshot_context(
+        self, doc, num_fewshot, provide_description=None, rnd=None, description=None
+    ):
+        assert (
+            num_fewshot == 0
+        ), "TruthfulQA is intended only for the zero-shot setting."
         return super().fewshot_context(
             doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
         )

     def construct_requests(self, doc, ctx):
-        """
-        Uses RequestFactory to construct Requests and returns an iterable of
+        """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.

         :param doc:

@@ -229,7 +233,7 @@ class TruthfulQAGeneration(Task):
         part of the document for `doc`.
         """
         # TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation.
-        completion = rf.greedy_until(ctx, ['.'])
+        completion = rf.greedy_until(ctx, ["."])
         return completion

     def process_results(self, doc, results):

@@ -243,18 +247,18 @@ class TruthfulQAGeneration(Task):
             The results of the requests created in construct_requests.
         """
         completion = results[0].strip()
-        true_refs, false_refs = doc['correct_answers'], doc['incorrect_answers']
+        true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
         all_refs = true_refs + false_refs

         # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.

         # BLEURT
         bleurt_scores_true = self.bleurt.compute(
-            predictions=[completion] * len(true_refs),
-            references=true_refs)['scores']
+            predictions=[completion] * len(true_refs), references=true_refs
+        )["scores"]
         bleurt_scores_false = self.bleurt.compute(
-            predictions=[completion] * len(false_refs),
-            references=false_refs)['scores']
+            predictions=[completion] * len(false_refs), references=false_refs
+        )["scores"]
         bleurt_correct = max(bleurt_scores_true)
         bleurt_incorrect = max(bleurt_scores_false)
         bleurt_max = bleurt_correct

@@ -263,8 +267,8 @@ class TruthfulQAGeneration(Task):
         # BLEU
         bleu_scores = [self.bleu([[ref]], [completion]) for ref in all_refs]
-        bleu_correct = np.nanmax(bleu_scores[:len(true_refs)])
-        bleu_incorrect = np.nanmax(bleu_scores[len(true_refs):])
+        bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
+        bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
         bleu_max = bleu_correct
         bleu_diff = bleu_correct - bleu_incorrect
         bleu_acc = int(bleu_correct > bleu_incorrect)

@@ -272,23 +276,23 @@ class TruthfulQAGeneration(Task):
         # ROUGE-N
         rouge_scores = [self.rouge([ref], [completion]) for ref in all_refs]
         # ROUGE-1
-        rouge1_scores = [score['rouge1'] for score in rouge_scores]
-        rouge1_correct = np.nanmax(rouge1_scores[:len(true_refs)])
-        rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs):])
+        rouge1_scores = [score["rouge1"] for score in rouge_scores]
+        rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
+        rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
         rouge1_max = rouge1_correct
         rouge1_diff = rouge1_correct - rouge1_incorrect
         rouge1_acc = int(rouge1_correct > rouge1_incorrect)
         # ROUGE-2
-        rouge2_scores = [score['rouge2'] for score in rouge_scores]
-        rouge2_correct = np.nanmax(rouge2_scores[:len(true_refs)])
-        rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs):])
+        rouge2_scores = [score["rouge2"] for score in rouge_scores]
+        rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
+        rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
         rouge2_max = rouge2_correct
         rouge2_diff = rouge2_correct - rouge2_incorrect
         rouge2_acc = int(rouge2_correct > rouge2_incorrect)
         # ROUGE-L
-        rougeL_scores = [score['rougeLsum'] for score in rouge_scores]
-        rougeL_correct = np.nanmax(rougeL_scores[:len(true_refs)])
-        rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs):])
+        rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
+        rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
+        rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
         rougeL_max = rougeL_correct
         rougeL_diff = rougeL_correct - rougeL_incorrect
         rougeL_acc = int(rougeL_correct > rougeL_incorrect)

@@ -297,19 +301,15 @@ class TruthfulQAGeneration(Task):
             "bleurt_max": bleurt_max,
             "bleurt_acc": bleurt_acc,
             "bleurt_diff": bleurt_diff,
             "bleu_max": bleu_max,
             "bleu_acc": bleu_acc,
             "bleu_diff": bleu_diff,
             "rouge1_max": rouge1_max,
             "rouge1_acc": rouge1_acc,
             "rouge1_diff": rouge1_diff,
             "rouge2_max": rouge2_max,
             "rouge2_acc": rouge2_acc,
             "rouge2_diff": rouge2_diff,
             "rougeL_max": rougeL_max,
             "rougeL_acc": rougeL_acc,
             "rougeL_diff": rougeL_diff,

@@ -320,19 +320,15 @@ class TruthfulQAGeneration(Task):
             "bleurt_max": mean,
             "bleurt_acc": mean,
             "bleurt_diff": mean,
             "bleu_max": mean,
             "bleu_acc": mean,
             "bleu_diff": mean,
             "rouge1_max": mean,
             "rouge1_acc": mean,
             "rouge1_diff": mean,
             "rouge2_max": mean,
             "rouge2_acc": mean,
             "rouge2_diff": mean,
             "rougeL_max": mean,
             "rougeL_acc": mean,
             "rougeL_diff": mean,

@@ -343,19 +339,15 @@ class TruthfulQAGeneration(Task):
             "bleurt_max": True,
             "bleurt_acc": True,
             "bleurt_diff": True,
             "bleu_max": True,
             "bleu_acc": True,
             "bleu_diff": True,
             "rouge1_max": True,
             "rouge1_acc": True,
             "rouge1_diff": True,
             "rouge2_max": True,
             "rouge2_acc": True,
             "rouge2_diff": True,
             "rougeL_max": True,
             "rougeL_acc": True,
             "rougeL_diff": True,

@@ -379,7 +371,7 @@ class TruthfulQAGeneration(Task):
             force=False,
             lowercase=False,
             tokenize="intl",
-            use_effective_order=False
+            use_effective_order=False,
         ).score
         return score

@@ -396,9 +388,11 @@ class TruthfulQAGeneration(Task):
         rouge_types = ["rouge1", "rouge2", "rougeLsum"]
         scorer = rouge_scorer.RougeScorer(rouge_types)

         # Add newlines between sentences to correctly compute `rougeLsum`.
+
         def _prepare_summary(summary):
             summary = summary.replace(" . ", ".\n")
             return summary

         # Accumulate confidence intervals.
         aggregator = scoring.BootstrapAggregator()
         for ref, pred in zip(refs, preds):

@@ -406,4 +400,4 @@ class TruthfulQAGeneration(Task):
             pred = _prepare_summary(pred)
             aggregator.add_scores(scorer.score(ref, pred))
         result = aggregator.aggregate()
         return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
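The mc2 metric above gives TruthfulQA credit equal to the normalized probability mass on the true answers. A small self-contained check of that arithmetic (the log-likelihoods and labels here are made up for illustration):

import numpy as np

# Hypothetical log-likelihoods: two true mc2 choices followed by two false ones.
lls = np.array([-1.0, -2.0, -3.0, -4.0])
labels = [1, 1, 0, 0]

split_idx = labels.index(0)  # index of the first false choice
ll_true, ll_false = lls[:split_idx], lls[split_idx:]
p_true, p_false = np.exp(ll_true), np.exp(ll_false)
mc2 = p_true.sum() / (p_true.sum() + p_false.sum())
print(round(float(mc2), 3))  # 0.881: share of probability mass on the true answers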
lm_eval/tasks/unscramble.py

@@ -49,6 +49,12 @@ class WordUnscrambleTask(Task):
     def doc_to_text(self, doc):
         return doc["context"]

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["context"]
+
     def doc_to_target(self, doc):
         return doc["completion"]

@@ -59,19 +65,13 @@ class WordUnscrambleTask(Task):
     def process_results(self, doc, results):
         pred = results[0]
         gold = doc["completion"]
-        return {
-            "acc": int(pred == gold)
-        }
+        return {"acc": int(pred == gold)}

     def aggregation(self):
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}


 class Anagrams1(WordUnscrambleTask):
lm_eval/tasks/webqs.py

@@ -54,13 +54,19 @@ class WebQs(Task):
         return self.dataset["test"]

     def doc_to_text(self, doc):
-        return "Question: " + doc['question'] + '\nAnswer:'
+        return "Question: " + doc["question"] + "\nAnswer:"
+
+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["question"]

     def doc_to_target(self, doc):
         # this picks one answer to be the "correct" one, despite sometimes
         # multiple correct answers being possible.
         # TODO: make sure we're actually handling multi-answer correctly
-        return " " + doc['answers'][0]
+        return " " + doc["answers"][0]

     def _remove_prefixes(self, aliases):
         # Optimization: Remove any alias that has a strict prefix elsewhere in the list

@@ -75,15 +81,13 @@ class WebQs(Task):
     def construct_requests(self, doc, ctx):
         ret = []
-        for alias in self._remove_prefixes(doc['answers']):
+        for alias in self._remove_prefixes(doc["answers"]):
             _, is_prediction = rf.loglikelihood(ctx, " " + alias)
             ret.append(is_prediction)
         return ret

     def process_results(self, doc, results):
-        return {
-            "acc": float(any(results))
-        }
+        return {"acc": float(any(results))}

     def aggregation(self):
         return {

@@ -91,6 +95,4 @@ class WebQs(Task):
         }

     def higher_is_better(self):
-        return {
-            "acc": True
-        }
+        return {"acc": True}
lm_eval/tasks/wikitext.py

@@ -90,6 +90,9 @@ class WikiText(PerplexityTask):
     def doc_to_target(self, doc):
         return wikitext_detokenizer(doc)

+    def should_decontaminate(self):
+        return True
+
     def count_words(self, doc):
         # count number of words in *original doc before detokenization*
         return len(re.split(r"\s+", doc))
lm_eval/tasks/winogrande.py

@@ -34,7 +34,7 @@ class Winogrande(Task):
     DATASET_PATH = "winogrande"
     DATASET_NAME = "winogrande_xl"

-    answer_to_num = {'1': 0, '2': 1}
+    answer_to_num = {"1": 0, "2": 1}

     def has_training_docs(self):
         return True

@@ -56,6 +56,12 @@ class Winogrande(Task):
     def doc_to_text(self, doc):
         return self.partial_context(doc, doc["option" + doc["answer"]])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["sentence"]
+
     @classmethod
     def partial_context(cls, doc, option):
         # Substitute the pronoun in the sentence with the specified option

@@ -107,9 +113,7 @@ class Winogrande(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
-        return {
-            "acc": np.argmax(results) == self.answer_to_num[doc["answer"]]
-        }
+        return {"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]}

@@ -117,9 +121,7 @@ class Winogrande(Task):
         A dictionary where keys are the names of submetrics and values are
             functions that aggregate a list of metrics
         """
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

@@ -127,6 +129,4 @@ class Winogrande(Task):
         A dictionary where keys are the names of submetrics and values are
             whether a higher value of the submetric is better
         """
-        return {
-            "acc": True
-        }
+        return {"acc": True}
lm_eval/tasks/wsc273.py

@@ -40,8 +40,19 @@ class WinogradSchemaChallenge273(Task):
     DATASET_PATH = "winograd_wsc"
     DATASET_NAME = "wsc273"

-    upper_pronouns = ["A", "An", "The", "She", "He", "It", "They", "My", "His", "Her", "Their"]
+    upper_pronouns = [
+        "A",
+        "An",
+        "The",
+        "She",
+        "He",
+        "It",
+        "They",
+        "My",
+        "His",
+        "Her",
+        "Their",
+    ]

     def has_training_docs(self):
         return False

@@ -68,7 +79,7 @@ class WinogradSchemaChallenge273(Task):
             option += "'s"
         # Appropriately lowercase the pronoun in the option.
         pronoun = option.split()[0]
-        start_of_sentence = doc["text"][doc['pronoun_loc'] - 2] == '.'
+        start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
         if not start_of_sentence and pronoun in self.upper_pronouns:
             return option.replace(pronoun, pronoun.lower())
         return option

@@ -85,11 +96,17 @@ class WinogradSchemaChallenge273(Task):
     def doc_to_text(self, doc):
         return self.partial_context(doc, doc["options"][doc["label"]])

+    def should_decontaminate(self):
+        return True
+
+    def doc_to_decontamination_query(self, doc):
+        return doc["text"]
+
     @classmethod
     def partial_context(cls, doc, option):
         # Substitute the pronoun in the original text with the specified
         # option and ignore everything after.
-        return doc["text"][:doc["pronoun_loc"]] + option
+        return doc["text"][: doc["pronoun_loc"]] + option

     def doc_to_target(self, doc):
         return self.partial_target(doc)

@@ -135,9 +152,7 @@ class WinogradSchemaChallenge273(Task):
         :param results:
             The results of the requests created in construct_requests.
         """
-        return {
-            "acc": np.argmax(results) == doc["label"]
-        }
+        return {"acc": np.argmax(results) == doc["label"]}

@@ -145,9 +160,7 @@ class WinogradSchemaChallenge273(Task):
         A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
         """
-        return {
-            "acc": mean
-        }
+        return {"acc": mean}

@@ -155,6 +168,4 @@ class WinogradSchemaChallenge273(Task):
         A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
         """
-        return {
-            "acc": True
-        }
+        return {"acc": True}
lm_eval/utils.py

@@ -34,6 +34,7 @@ def simple_parse_args_string(args_string):
         args_dict[k] = v
     return args_dict

+
 def join_iters(iters):
     for iter in iters:
         yield from iter

@@ -47,7 +48,9 @@ def chunks(iter, n):
             yield arr
             arr = []

-    if arr: yield arr
+    if arr:
+        yield arr
+

 def group(arr, fn):
     res = collections.defaultdict(list)

@@ -57,12 +60,13 @@ def group(arr, fn):
     return list(res.values())

+
 def general_detokenize(string):
     string = string.replace(" n't", "n't")
     string = string.replace(" )", ")")
     string = string.replace("( ", "(")
-    string = string.replace("\" ", "\"")
-    string = string.replace(" \"", "\"")
+    string = string.replace('" ', '"')
+    string = string.replace(' "', '"')
     string = re.sub(r" (['.,])", r"\1", string)
     return string

@@ -94,10 +98,7 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
     # Special handling for first window: predict all tokens
     first_seq_len = min(max_seq_len, len(token_list))
-    yield (
-        [prefix_token] + token_list[:first_seq_len - 1],
-        token_list[:first_seq_len]
-    )
+    yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
     predicted += first_seq_len

     while predicted < len(token_list):

@@ -105,31 +106,30 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
         window_end = predicted + window_pred_len

         yield (
-            token_list[window_end - max_seq_len - 1:window_end - 1],
-            token_list[window_end - window_pred_len:window_end]
+            token_list[window_end - max_seq_len - 1 : window_end - 1],
+            token_list[window_end - window_pred_len : window_end],
         )
         predicted += window_pred_len


 def make_disjoint_window(pair):
-    """
-    Takes output from get_rolling_token_windows and makes the context not overlap with the continuation
-    """
+    """Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
     a, b = pair
-    return a[:-(len(b) - 1)], b
+    return a[: -(len(b) - 1)], b


 class Reorderer:
     def __init__(self, arr, fn):
         self.size = len(arr)
         arr = list(enumerate(arr))
         arr = group(arr, lambda x: fn(x[1]))
         arr = [([y[0] for y in x], x[0][1]) for x in arr]
         arr.sort(key=lambda x: fn(x[1]))

         self.arr = arr

     def get_reordered(self):
         return [x[1] for x in self.arr]

@@ -146,20 +146,26 @@ class Reorderer:
         return res

+
 def positional_deprecated(fn):
     """
     A decorator to nudge users into passing only keyword args (`kwargs`) to the
     wrapped function, `fn`.
     """
+
     @functools.wraps(fn)
     def _wrapper(*args, **kwargs):
         if len(args) != 1 if inspect.ismethod(fn) else 0:
-            print(f"WARNING: using {fn.__name__} with positional arguments is "
+            print(
+                f"WARNING: using {fn.__name__} with positional arguments is "
                 "deprecated and will be disallowed in a future version of "
-                "lm-evaluation-harness!")
+                "lm-evaluation-harness!"
+            )
         return fn(*args, **kwargs)
+
     return _wrapper

+
 @positional_deprecated
 def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
     """

@@ -169,12 +175,14 @@ def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
     cur_path = start_path.resolve()
     max_layers = 3
     for _ in range(max_layers):
-        if (cur_path / 'tests' / 'test_version_stable.py').exists():
+        if (cur_path / "tests" / "test_version_stable.py").exists():
             return cur_path
         else:
            cur_path = cur_path.parent.resolve()
-    raise FileNotFoundError(f"Unable to find package root within {max_layers} upwards" + \
-                            f"of {start_path}")
+    raise FileNotFoundError(
+        f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
+    )


 @positional_deprecated
 def run_task_tests(task_list: List[str]):

@@ -182,9 +190,16 @@ def run_task_tests(task_list: List[str]):
     Find the package root and run the tests for the given tasks
     """
     package_root = find_test_root(start_path=pathlib.Path(__file__))
-    task_string = ' or '.join(task_list)
-    args = [f'{package_root}/tests/test_version_stable.py', f'--rootdir={package_root}', '-k', f'{task_string}']
+    task_string = " or ".join(task_list)
+    args = [
+        f"{package_root}/tests/test_version_stable.py",
+        f"--rootdir={package_root}",
+        "-k",
+        f"{task_string}",
+    ]
     sys.path.append(str(package_root))
     pytest_return_val = pytest.main(args)
     if pytest_return_val:
-        raise ValueError(f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}")
\ No newline at end of file
+        raise ValueError(
+            f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
+        )
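For readers skimming the positional_deprecated change: the decorator only prints a warning and then forwards the call, so behavior is unchanged. A minimal usage sketch, assuming lm_eval is importable; the evaluate function below is a placeholder:

from lm_eval.utils import positional_deprecated

@positional_deprecated
def evaluate(model=None, tasks=None):
    # Placeholder function used only to demonstrate the decorator.
    return model, tasks

evaluate(model="gpt2", tasks=["lambada"])  # keyword arguments: no warning
evaluate("gpt2", ["lambada"])              # positional arguments: prints the deprecation warning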
main.py

 import argparse
 import json
 import logging
+import fnmatch

 from lm_eval import tasks, evaluator

 logging.getLogger("openai").setLevel(logging.WARNING)


+class MultiChoice:
+    def __init__(self, choices):
+        self.choices = choices
+
+    # Simple wildcard support (linux filename patterns)
+    def __contains__(self, values):
+        for value in values.split(","):
+            if len(fnmatch.filter(self.choices, value)) == 0:
+                return False
+
+        return True
+
+    def __iter__(self):
+        for choice in self.choices:
+            yield choice
+
+
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', required=True)
-    parser.add_argument('--model_args', default="")
-    parser.add_argument('--tasks', default="all_tasks")
-    parser.add_argument('--provide_description', action="store_true")
-    parser.add_argument('--num_fewshot', type=int, default=0)
-    parser.add_argument('--batch_size', type=int, default=None)
-    parser.add_argument('--device', type=str, default=None)
-    parser.add_argument('--output_path', default=None)
-    parser.add_argument('--limit', type=int, default=None)
-    parser.add_argument('--no_cache', action="store_true")
-    parser.add_argument('--description_dict_path', default=None)
-    parser.add_argument('--check_integrity', action="store_true")
+    parser.add_argument("--model", required=True)
+    parser.add_argument("--model_args", default="")
+    parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS))
+    parser.add_argument("--provide_description", action="store_true")
+    parser.add_argument("--num_fewshot", type=int, default=0)
+    parser.add_argument("--batch_size", type=int, default=None)
+    parser.add_argument("--device", type=str, default=None)
+    parser.add_argument("--output_path", default=None)
+    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--no_cache", action="store_true")
+    parser.add_argument("--decontamination_ngrams_path", default=None)
+    parser.add_argument("--description_dict_path", default=None)
+    parser.add_argument("--check_integrity", action="store_true")

     return parser.parse_args()


+# Returns a list containing all values of the source_list that
+# match at least one of the patterns
+def pattern_match(patterns, source_list):
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return list(task_names)
+
+
 def main():
     args = parse_args()

     assert not args.provide_description  # not implemented

     if args.limit:
-        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
+        print(
+            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+        )

-    if args.tasks == "all_tasks":
+    if args.tasks is None:
         task_names = tasks.ALL_TASKS
     else:
-        task_names = args.tasks.split(",")
+        task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
+
+    print(f"Selected Tasks: {task_names}")

     description_dict = {}
     if args.description_dict_path:
-        with open(args.description_dict_path, 'r') as f:
+        with open(args.description_dict_path, "r") as f:
             description_dict = json.load(f)

     results = evaluator.simple_evaluate(

@@ -51,11 +86,11 @@ def main():
         no_cache=args.no_cache,
         limit=args.limit,
         description_dict=description_dict,
-        check_integrity=args.check_integrity
+        decontamination_ngrams_path=args.decontamination_ngrams_path,
+        check_integrity=args.check_integrity,
     )

     dumped = json.dumps(results, indent=2)
     print(dumped)

     if args.output_path:
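The --tasks flag now accepts comma-separated names with shell-style wildcards, resolved against tasks.ALL_TASKS via pattern_match. A quick standalone illustration of the matching logic; the task names below are placeholders, not the full registry:

import fnmatch

def pattern_match(patterns, source_list):
    # Same logic as in main.py: expand each wildcard pattern against the registry.
    task_names = set()
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.add(matching)
    return list(task_names)

all_tasks = ["boolq", "lambada", "wmt14-fr-en", "wmt16-de-en"]  # illustrative subset
print(sorted(pattern_match("wmt*,boolq".split(","), all_tasks)))
# ['boolq', 'wmt14-fr-en', 'wmt16-de-en']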
pile_statistics.json (new file, 0 → 100644)

{
    "Data": "Pile statistics",
    "Document Count": 210607728,
    "Total Pile Characters": 421215456,
    "File Start Offsets": [
        0,
        7021438,
        14042822,
        21066113,
        28086515,
        35106072,
        42123306,
        49145091,
        56165817,
        63185587,
        70211208,
        77234322,
        84249267,
        91267634,
        98285983,
        105305110,
        112322489,
        119342491,
        126367373,
        133389153,
        140412039,
        147432373,
        154452516,
        161470190,
        168492733,
        175512521,
        182526939,
        189547478,
        196565318,
        203583306
    ]
}
scripts/clean_training_data/README.md

@@ -5,7 +5,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165)
 1) Collects all contamination text files that are to be removed from training data
 2) Filters training data by finding `N`gram matches between the training data
    and any contamination
-   1) `N`grams ignore case and punctation and are split on whitespace.
+   1) `N`grams ignore case and punctuation and are split on whitespace.
    2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
       the match, splitting the training data into chunks
    3) Any chunks less than `minimum_slice_length` are removed

@@ -20,7 +20,7 @@
 minimum_slice_length = 200
 too_dirty_cutoff = 10
 ```

-## Compling
+## Compiling

 Janitor can be used as a pure python program, but it is much faster if the ngram
 code is run in C++. To compile the C++ code, run

@@ -31,4 +31,3 @@
 ```

 If your your compiler isn't linked to python, you may need to add to the above
 `-undefined dynamic_lookup`
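The README's normalization rule (n-grams ignore case and punctuation and are split on whitespace) can be summarized in a few lines. This is only a sketch of the described behavior, not the janitor's actual implementation, which also has a C++ fast path:

import string

def normalized_ngrams(text, n=13):
    # Lowercase, drop punctuation, split on whitespace, then emit n-grams.
    table = str.maketrans("", "", string.punctuation)
    words = text.lower().translate(table).split()
    return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]

print(normalized_ngrams("The quick, brown fox jumps over the lazy dog!", n=3)[:2])
# ['the quick brown', 'quick brown fox']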
scripts/clean_training_data/compress_and_package.py (new file, 0 → 100644)

import glob
import argparse
import os
import subprocess
import shutil

from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)


def process_task(
    working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm
):
    command = f"zstd {bucket_file_path}"
    logger.info(command)
    subprocess.call(command, shell=True)

    compressed_file = bucket_file_path + ".zst"
    if output_directory:
        shutil.move(compressed_file, output_directory)

    os.remove(bucket_file_path)
    global_tqdm.update()


def compress_and_move(working_directory, output_directory, process_count):
    os.makedirs(output_directory, exist_ok=True)
    original_info_file_path = os.path.join(working_directory, "info.json")
    assert os.path.exists(original_info_file_path)

    tasks = []
    bucket_file_paths = glob.glob(
        os.path.join(working_directory, "output", f"*.bkt.txt.sorted")
    )
    for bucket_file_path in bucket_file_paths:
        task = (process_task, (working_directory, output_directory, bucket_file_path))
        tasks.append(task)

    pool = TqdmMultiProcessPool(process_count)

    def on_done(_):
        return None

    def on_error(_):
        return None

    global_progress = tqdm(
        total=len(bucket_file_paths), dynamic_ncols=True, unit="file"
    )
    _ = pool.map(global_progress, tasks, on_error, on_done)

    shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json"))


parser = argparse.ArgumentParser(description="sort 13gram buckets")
parser.add_argument("-dir", "--working_directory", required=True)
parser.add_argument("-output", "--output_directory", required=True)
parser.add_argument("-procs", "--process_count", type=int, default=8)

if __name__ == "__main__":
    version = 1.00
    print(f"Running version {version}")

    logfile_path = "compress_and_package.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    compress_and_move(
        args.working_directory, args.output_directory, args.process_count
    )
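The new script is normally driven from the command line (-dir, -output, -procs), but compress_and_move can also be called directly. A hedged example with placeholder paths; it assumes the zstd binary and tqdm_multiprocess are installed, that the script's directory is on sys.path, and that the working directory already contains info.json plus output/*.bkt.txt.sorted buckets:

# Hypothetical direct invocation; every path below is a placeholder.
from compress_and_package import compress_and_move

compress_and_move(
    working_directory="/scratch/pile_ngrams",     # must hold info.json and output/*.bkt.txt.sorted
    output_directory="/scratch/pile_ngrams_zst",  # compressed .zst buckets are moved here
    process_count=8,
)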