gaoqiong / lm-evaluation-harness · Commits

Commit d27c0c08 (unverified)
Authored Feb 26, 2024 by LSinev; committed by GitHub on Feb 26, 2024
Parent: f78e2da4

Apply code autoformatting with Ruff to tasks/*.py and *__init__.py (#1469)

48 files changed in the commit; this page shows 20 changed files with 124 additions and 54 deletions (+124 −54).
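The commit records only the reformatted output, not the command that produced it. As a hedged sketch (the exact invocation and Ruff configuration are not shown in this commit), changes of this kind typically come from Ruff's two passes, both real subcommands: "ruff check --fix" applies auto-fixable lint rules such as isort-style import grouping, and "ruff format" normalizes quote style, line wrapping, and trailing commas.

    # Hedged sketch, not part of the commit: reproduce this style of cleanup locally.
    # Assumes Ruff is installed; paths mirror the files touched below.
    import subprocess

    for cmd in (
        ["ruff", "check", "--fix", "lm_eval/tasks/"],  # lint fixes (e.g. import ordering)
        ["ruff", "format", "lm_eval/tasks/"],          # formatter (quotes, wrapping, commas)
    ):
        subprocess.run(cmd, check=True)  # raises CalledProcessError if a pass fails

Every hunk below is consistent with those passes: imports regrouped into stdlib/third-party/first-party blocks, single quotes normalized to double quotes, and long literals exploded one element per line with a trailing comma.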
lm_eval/tasks/hellaswag/utils.py                                         +2  −1
lm_eval/tasks/ifeval/instructions.py                                     +2  −0
lm_eval/tasks/ifeval/instructions_registry.py                            +1  −0
lm_eval/tasks/kobest/utils.py                                            +13 −2
lm_eval/tasks/medmcqa/utils_medmcqa.py                                   +6  −1
lm_eval/tasks/medqa/preprocess_medqa.py                                  +6  −1
lm_eval/tasks/mgsm/utils.py                                              +5  −3
lm_eval/tasks/minerva_math/utils.py                                      +6  −3
lm_eval/tasks/mmlu/_generate_configs.py                                  +3  −3
lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py                            +25 −12
lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py                       +25 −12
lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py  +1  −2
lm_eval/tasks/model_written_evals/persona/_generate_configs.py           +1  −2
lm_eval/tasks/okapi/arc_multilingual/utils.py                            +9  −2
lm_eval/tasks/okapi/hellaswag_multilingual/utils.py                      +2  −1
lm_eval/tasks/okapi/mmlu_multilingual/_generate_configs.py               +8  −4
lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py                     +4  −1
lm_eval/tasks/qasper/metrics.py                                          +0  −1
lm_eval/tasks/qasper/utils.py                                            +2  −1
lm_eval/tasks/realtoxicityprompts/metric.py                              +3  −2
lm_eval/tasks/hellaswag/utils.py

-import datasets
 import re
+
+import datasets


 def preprocess(text):
     text = text.strip()
...
lm_eval/tasks/ifeval/instructions.py

...
@@ -22,8 +22,10 @@ import string
 from typing import Dict, Optional, Sequence, Union
+
 import langdetect
+
 from lm_eval.tasks.ifeval import instructions_util

 logger = logging.getLogger(__name__)

 _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]
...
lm_eval/tasks/ifeval/instructions_registry.py

...
@@ -15,6 +15,7 @@
 """Registry of all instructions."""
+
 from lm_eval.tasks.ifeval import instructions

 _KEYWORD = "keywords:"
 _LANGUAGE = "language:"
...
lm_eval/tasks/kobest/utils.py

...
@@ -6,32 +6,43 @@ def copa_doc_to_text(doc: dict) -> str:
     connector = {"원인": " 왜냐하면", "결과": " 그래서"}[doc["question"].strip()]
     return f"""{doc["premise"]}{connector}"""


 def copa_doc_to_target(doc: dict) -> str:
     correct_choice = doc["alternative_1"] if doc["label"] == 0 else doc["alternative_2"]
     return f"""{correct_choice}"""


 def copa_doc_to_choice(doc: dict) -> list:
     return [f"""{doc["alternative_1"]}""", f"""{doc["alternative_2"]}"""]


 def sentineg_doc_to_text(doc: dict):
     return f"""문장: {doc["sentence"]} 긍부정:"""


 def wic_doc_to_text(doc: dict) -> str:
     return f"""문장1: {doc["context_1"]} 문장2: {doc["context_2"]} 두 문장에서 {doc["word"]}가 같은 뜻으로 쓰였나?"""


 def hellaswag_process_doc(doc: Dataset) -> Dataset:
     def preprocessor(dataset):
         return {
             "query": f"""문장: {dataset["context"]}""",
-            "choices": [dataset["ending_1"], dataset["ending_2"], dataset["ending_3"], dataset["ending_4"]],
+            "choices": [
+                dataset["ending_1"],
+                dataset["ending_2"],
+                dataset["ending_3"],
+                dataset["ending_4"],
+            ],
             "gold": int(dataset["label"]),
         }

     return doc.map(preprocessor)


 def macro_f1_score(items):
     unzipped_list = list(zip(*items))
     golds = unzipped_list[0]
     preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average='macro')
+    fscore = f1_score(golds, preds, average="macro")
     return fscore
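For context, macro_f1_score consumes (gold, prediction) pairs and unzips them before scoring. A hedged usage sketch, assuming f1_score is sklearn.metrics.f1_score (its import sits outside the shown hunk):

    # Hedged sketch of the aggregation above; sklearn assumption is ours, not the diff's.
    from sklearn.metrics import f1_score

    items = [(1, 1), (0, 1), (1, 0), (0, 0)]  # (gold, pred) pairs as the harness passes them
    golds, preds = zip(*items)                # golds = (1, 0, 1, 0), preds = (1, 1, 0, 0)
    print(f1_score(golds, preds, average="macro"))  # 0.5: per-class F1, averaged unweighted

The quote change ('macro' → "macro") is purely cosmetic; the two calls are identical at runtime.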
lm_eval/tasks/medmcqa/utils_medmcqa.py

...
@@ -10,7 +10,12 @@ def doc_to_text(doc) -> str:
     Answer:
     """
     choices = [doc["opa"], doc["opb"], doc["opc"], doc["opd"]]
-    option_choices = {'A': choices[0], 'B': choices[1], 'C': choices[2], 'D': choices[3]}
+    option_choices = {
+        "A": choices[0],
+        "B": choices[1],
+        "C": choices[2],
+        "D": choices[3],
+    }

     prompt = "Question: " + doc["question"] + "\nChoices:\n"
     for choice, option in option_choices.items():
...
lm_eval/tasks/medqa/preprocess_medqa.py

 def doc_to_text(doc) -> str:
-    option_choices = {'A': doc["ending0"], 'B': doc["ending1"], 'C': doc["ending2"], 'D': doc["ending3"]}
+    option_choices = {
+        "A": doc["ending0"],
+        "B": doc["ending1"],
+        "C": doc["ending2"],
+        "D": doc["ending3"],
+    }
     answers = "".join((f"{k}. {v}\n") for k, v in option_choices.items())
     return f"Question: {doc['sent1']}\n{answers}Answer:"
...
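A hedged worked example of the prompt this function renders, with invented field values (the real dataset supplies sent1 and ending0..ending3):

    # Hypothetical document, inlined so the example is self-contained.
    doc = {
        "sent1": "Which organ produces insulin?",
        "ending0": "Liver",
        "ending1": "Pancreas",
        "ending2": "Kidney",
        "ending3": "Spleen",
    }
    option_choices = {"A": doc["ending0"], "B": doc["ending1"], "C": doc["ending2"], "D": doc["ending3"]}
    answers = "".join(f"{k}. {v}\n" for k, v in option_choices.items())
    print(f"Question: {doc['sent1']}\n{answers}Answer:")
    # Question: Which organ produces insulin?
    # A. Liver
    # B. Pancreas
    # C. Kidney
    # D. Spleen
    # Answer: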
lm_eval/tasks/mgsm/utils.py

-import yaml
 import argparse
+
+import yaml

 LANGUAGES = {
     "bn": {  # Bengali
...
@@ -126,6 +127,7 @@ def add_regex_pattern(regex_pattern):
         ],
     }

+
 def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
     """
     Generate a yaml file for each language.
...
@@ -158,7 +160,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
             task_name = f"mgsm_en_cot_{lang}"
         file_name = f"{task_name}.yaml"
-        ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
+        ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
         with open(
             f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
         ) as f:
...
@@ -181,7 +183,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
             **filter_list,
             "generation_kwargs": {
                 "until": [QUESTION, "</s>", "<|im_end|>"],
-                "do_sample": False
+                "do_sample": False,
             },
             **({"target_delimiter": DELIMITER} if DELIMITER else {}),
         },
...
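One detail worth noting in gen_lang_yamls is the open mode "w" if overwrite else "x". A small, hedged illustration of the idiom (file name invented, though it matches the f"mgsm_en_cot_{lang}" pattern above):

    # "x" is exclusive-create: it raises FileExistsError instead of clobbering an
    # existing config, which is why these generator scripts wrap writes in try/except.
    overwrite = False
    try:
        with open("mgsm_en_cot_bn.yaml", "w" if overwrite else "x", encoding="utf8") as f:
            f.write("# generated config\n")
    except FileExistsError:
        pass  # keep the existing file when overwrite is False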
lm_eval/tasks/minerva_math/utils.py

-import datasets
 import re
 import signal
+from typing import Dict, List, Optional

+import datasets

 from lm_eval.utils import eval_logger
-from typing import Optional, List, Dict

 try:
     import sympy
     from sympy.parsing.latex import parse_latex
 except ModuleNotFoundError:
-    raise Exception(
+    raise ModuleNotFoundError(
         "`sympy` is required for generating translation task prompt templates. \
 please install sympy via pip install lm-eval[math] or pip install -e .[math]",
     )
...
lm_eval/tasks/mmlu/_generate_configs.py
View file @
d27c0c08
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import
os
import
yaml
import
argparse
import
os
import
yaml
from
tqdm
import
tqdm
from
lm_eval.logger
import
eval_logger
SUBJECTS
=
{
"abstract_algebra"
:
"stem"
,
"anatomy"
:
"stem"
,
...
...
@@ -124,7 +125,6 @@ if __name__ == "__main__":
yaml
.
dump
(
yaml_dict
,
yaml_file
,
# width=float("inf"),
allow_unicode
=
True
,
default_style
=
'"'
,
)
...
...
lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py

 import re
 import sys
 import unicodedata

 from lm_eval.filters.extraction import RegexFilter
...
@@ -10,8 +9,13 @@ class MultiChoiceRegexFilter(RegexFilter):
     """ """

     def __init__(
-        self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
-        ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
     ) -> None:
         """
         regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
...
@@ -44,8 +48,11 @@ class MultiChoiceRegexFilter(RegexFilter):
                 match = convert_dict[match]
             return match

-        punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
+        punct_tbl = dict.fromkeys(
+            i
+            for i in range(sys.maxunicode)
+            if unicodedata.category(chr(i)).startswith("P")
+        )

         def filter_ignores(st):
             if self.regexes_to_ignore is not None:
...
@@ -65,12 +72,12 @@ class MultiChoiceRegexFilter(RegexFilter):
         for r, doc in zip(resps, docs):
             fallback_regexes = []
             choice_to_alpha = {}
-            next_alpha = 'A'
+            next_alpha = "A"

             without_paren_fallback_regexes = []
             without_paren_to_target = {}

-            choices = doc['choices']
+            choices = doc["choices"]
             for c in choices:
                 m = filter_ignores(c.strip())
                 fallback_regexes.append(f"{re.escape(m)}")
...
@@ -80,17 +87,23 @@ class MultiChoiceRegexFilter(RegexFilter):
                     without_paren_to_target[next_alpha] = f"({next_alpha})"
                 next_alpha = chr(ord(next_alpha) + 1)
-            fallback_regex = re.compile('|'.join(fallback_regexes))
-            without_paren_fallback_regex = '|'.join(without_paren_fallback_regexes)
-            without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})")
+            fallback_regex = re.compile("|".join(fallback_regexes))
+            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
+            without_paren_fallback_regex = re.compile(
+                f":[\s]*({without_paren_fallback_regex})"
+            )

             filtered = []
             for resp in r:
                 match = find_match(self.regex, resp)
                 if not match:
-                    match = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
+                    match = find_match(
+                        fallback_regex, filter_ignores(resp), choice_to_alpha
+                    )
                 if not match:
-                    match = find_match(without_paren_fallback_regex, resp, without_paren_to_target)
+                    match = find_match(
+                        without_paren_fallback_regex, resp, without_paren_to_target
+                    )
                 if not match:
                     match = self.fallback
                 filtered.append(match)
...
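The reformatted logic above builds fallback regexes out of the choice texts themselves when the primary "#### <number>" pattern finds nothing. A simplified, hedged sketch of that matching strategy (standalone toy, not the harness class; sample choices invented):

    # Mirrors choice_to_alpha / fallback_regexes from the filter above, in miniature.
    import re

    choices = ["Paris", "London", "Berlin"]
    choice_to_alpha = {}
    fallback_regexes = []
    next_alpha = "A"
    for c in choices:
        m = c.strip()
        fallback_regexes.append(re.escape(m))   # match the literal choice text
        choice_to_alpha[m] = f"({next_alpha})"  # map it back to its letter
        next_alpha = chr(ord(next_alpha) + 1)

    fallback_regex = re.compile("|".join(fallback_regexes))
    hit = fallback_regex.search("I think the answer is Paris.")
    print(choice_to_alpha[hit.group(0)] if hit else "[invalid]")  # -> (A)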
lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py

(Identical diff to lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py above.)

 import re
 import sys
 import unicodedata

 from lm_eval.filters.extraction import RegexFilter
...
@@ -10,8 +9,13 @@ class MultiChoiceRegexFilter(RegexFilter):
     """ """

     def __init__(
-        self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", group_select=0, fallback: str = "[invalid]",
-        ignore_case=False, ignore_punctuation=False, regexes_to_ignore=None,
+        self,
+        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
+        group_select=0,
+        fallback: str = "[invalid]",
+        ignore_case=False,
+        ignore_punctuation=False,
+        regexes_to_ignore=None,
     ) -> None:
         """
         regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
...
@@ -44,8 +48,11 @@ class MultiChoiceRegexFilter(RegexFilter):
                 match = convert_dict[match]
             return match

-        punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
+        punct_tbl = dict.fromkeys(
+            i
+            for i in range(sys.maxunicode)
+            if unicodedata.category(chr(i)).startswith("P")
+        )

         def filter_ignores(st):
             if self.regexes_to_ignore is not None:
...
@@ -65,12 +72,12 @@ class MultiChoiceRegexFilter(RegexFilter):
         for r, doc in zip(resps, docs):
             fallback_regexes = []
             choice_to_alpha = {}
-            next_alpha = 'A'
+            next_alpha = "A"

             without_paren_fallback_regexes = []
             without_paren_to_target = {}

-            choices = doc['choices']
+            choices = doc["choices"]
             for c in choices:
                 m = filter_ignores(c.strip())
                 fallback_regexes.append(f"{re.escape(m)}")
...
@@ -80,17 +87,23 @@ class MultiChoiceRegexFilter(RegexFilter):
                     without_paren_to_target[next_alpha] = f"({next_alpha})"
                 next_alpha = chr(ord(next_alpha) + 1)
-            fallback_regex = re.compile('|'.join(fallback_regexes))
-            without_paren_fallback_regex = '|'.join(without_paren_fallback_regexes)
-            without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})")
+            fallback_regex = re.compile("|".join(fallback_regexes))
+            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
+            without_paren_fallback_regex = re.compile(
+                f":[\s]*({without_paren_fallback_regex})"
+            )

             filtered = []
             for resp in r:
                 match = find_match(self.regex, resp)
                 if not match:
-                    match = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha)
+                    match = find_match(
+                        fallback_regex, filter_ignores(resp), choice_to_alpha
+                    )
                 if not match:
-                    match = find_match(without_paren_fallback_regex, resp, without_paren_to_target)
+                    match = find_match(
+                        without_paren_fallback_regex, resp, without_paren_to_target
+                    )
                 if not match:
                     match = self.fallback
                 filtered.append(match)
...
lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py

-import yaml
 import datasets
+import yaml
 from tqdm import tqdm
...
lm_eval/tasks/model_written_evals/persona/_generate_configs.py

-import yaml
 import datasets
+import yaml
 from tqdm import tqdm
...
lm_eval/tasks/okapi/arc_multilingual/utils.py

-import datasets
 import re
+
+import datasets

 def preprocess(text):
     if text is None:
...
@@ -18,7 +19,13 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
     out_doc = {
         "id": doc["id"],
         "query": "Question: " + preprocess(doc["instruction"]) + "\nAnswer:",
-        "choices": [preprocess(doc['option_a']), preprocess(doc['option_b']), preprocess(doc['option_c']), preprocess(doc['option_d']), preprocess(doc['option_e'])],
+        "choices": [
+            preprocess(doc["option_a"]),
+            preprocess(doc["option_b"]),
+            preprocess(doc["option_c"]),
+            preprocess(doc["option_d"]),
+            preprocess(doc["option_e"]),
+        ],
         "gold": ["A", "B", "C", "D", "E"].index(doc["answer"]),
     }
     return out_doc
...
lm_eval/tasks/okapi/hellaswag_multilingual/utils.py

-import datasets
 import re
+
+import datasets

 def preprocess(text):
     text = text.strip()
...
lm_eval/tasks/okapi/mmlu_multilingual/_generate_configs.py

-import yaml
 import datasets
+import yaml
 from tqdm import tqdm
...
@@ -10,8 +9,12 @@ def main() -> None:
     # Removed hy and sk subdataset because the original dataset is broken
     # I created this PR https://huggingface.co/datasets/alexandrainst/m_mmlu/discussions/3
     # on the dataset for the authors, in case it will be accepted the filter can be removed
-    keys_without_hy_sk = list(filter(lambda k: ('hy' not in k and 'sk' not in k), datasets.get_dataset_infos(dataset_path).keys()))
+    keys_without_hy_sk = list(  # noqa: F841
+        filter(
+            lambda k: ("hy" not in k and "sk" not in k),
+            datasets.get_dataset_infos(dataset_path).keys(),
+        )
+    )

     for task in tqdm(...):
         file_name = f"m_mmlu_{task}.yaml"
...
@@ -29,5 +32,6 @@ def main() -> None:
         except FileExistsError:
             pass

+
 if __name__ == "__main__":
     main()
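Note that the filter above tests substring membership rather than exact language codes, so any key containing "hy" or "sk" is dropped. A hedged illustration with invented keys:

    # Invented keys for illustration only; the real keys come from
    # datasets.get_dataset_infos(dataset_path).keys().
    keys = ["m_mmlu_en", "m_mmlu_hy", "m_mmlu_sk", "m_mmlu_de"]
    print(list(filter(lambda k: ("hy" not in k and "sk" not in k), keys)))
    # ['m_mmlu_en', 'm_mmlu_de']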
lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py

-import datasets
 import re
+
+import datasets
 import numpy as np

 QA_PROMPT = (
     "Q: What is human life expectancy in the United States?\n"
     "A: Human life expectancy in the United States is 78 years.\n\n"
...
@@ -17,6 +19,7 @@ QA_PROMPT = (
     "A: The 1992 Olympics were held in Barcelona, Spain."
 )

+
 def preprocess(text):
     if text is None:
         return " "
...
lm_eval/tasks/qasper/metrics.py

 import re
 import string
 from collections import Counter
...
lm_eval/tasks/qasper/utils.py

-from datasets import Dataset
 from functools import partial
+
+from datasets import Dataset

 def process_docs(dataset, set_answer_type="bool"):
     FEATURES = ["title", "abstract", "question", "answer", "answer_type"]
...
lm_eval/tasks/realtoxicityprompts/metric.py

-import os
 import json
-import requests
+import os

+import numpy as np
+import requests

 from lm_eval.utils import eval_logger
...