Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
ab96fc7e
Commit
ab96fc7e
authored
Feb 20, 2024
by
lintangsutawika
Browse files
merged with latest update
parents
bf2517cc
8680e938
Changes
128
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
327 additions
and
3 deletions
+327
-3
lm_eval/tasks/haerae/haerae_hi.yaml
lm_eval/tasks/haerae/haerae_hi.yaml
+3
-0
lm_eval/tasks/haerae/haerae_lw.yaml
lm_eval/tasks/haerae/haerae_lw.yaml
+3
-0
lm_eval/tasks/haerae/haerae_rw.yaml
lm_eval/tasks/haerae/haerae_rw.yaml
+3
-0
lm_eval/tasks/haerae/haerae_sn.yaml
lm_eval/tasks/haerae/haerae_sn.yaml
+3
-0
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
...u/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
+12
-2
lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
+99
-0
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
...lan_n_shot/generative/_mmlu_flan_generative_template_yaml
+15
-1
lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
+99
-0
lm_eval/tasks/okapi/mmlu_multilingual/_default_yaml
lm_eval/tasks/okapi/mmlu_multilingual/_default_yaml
+17
-0
lm_eval/tasks/okapi/mmlu_multilingual/_generate_configs.py
lm_eval/tasks/okapi/mmlu_multilingual/_generate_configs.py
+33
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ar.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ar.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_bn.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_bn.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ca.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ca.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_da.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_da.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_de.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_de.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_en.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_en.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_es.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_es.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_eu.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_eu.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_fr.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_fr.yaml
+4
-0
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_gu.yaml
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_gu.yaml
+4
-0
No files found.
lm_eval/tasks/haerae/haerae_hi.yaml
0 → 100644
View file @
ab96fc7e
"
dataset_name"
:
"
history"
"
include"
:
"
_default_haerae_yaml"
"
task"
:
"
haerae_history"
lm_eval/tasks/haerae/haerae_lw.yaml
0 → 100644
View file @
ab96fc7e
"
dataset_name"
:
"
loan_words"
"
include"
:
"
_default_haerae_yaml"
"
task"
:
"
haerae_loan_word"
lm_eval/tasks/haerae/haerae_rw.yaml
0 → 100644
View file @
ab96fc7e
"
dataset_name"
:
"
rare_words"
"
include"
:
"
_default_haerae_yaml"
"
task"
:
"
haerae_rare_word"
lm_eval/tasks/haerae/haerae_sn.yaml
0 → 100644
View file @
ab96fc7e
"
dataset_name"
:
"
standard_nomenclature"
"
include"
:
"
_default_haerae_yaml"
"
task"
:
"
haerae_standard_nomenclature"
lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
View file @
ab96fc7e
...
@@ -5,14 +5,24 @@ output_type: generate_until
...
@@ -5,14 +5,24 @@ output_type: generate_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
filter_list:
- name: "
get-answer
"
- name: "
strict-match
"
filter:
filter:
- function: "regex"
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
- function: "take_first"
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
generation_kwargs:
generation_kwargs:
until:
until:
- "</s>"
- "</s>"
- "Q:"
- "<|im_end|>"
do_sample: false
do_sample: false
temperature: 0.0
temperature: 0.0
num_fewshot: 0
num_fewshot: 0
...
@@ -23,4 +33,4 @@ metric_list:
...
@@ -23,4 +33,4 @@ metric_list:
ignore_case: true
ignore_case: true
ignore_punctuation: true
ignore_punctuation: true
metadata:
metadata:
version:
0
.0
version:
1
.0
lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
0 → 100644
View file @
ab96fc7e
import
re
import
sys
import
unicodedata
from
lm_eval.filters.extraction
import
RegexFilter
class MultiChoiceRegexFilter(RegexFilter):
    r"""Filter that extracts a multiple-choice answer letter from free-form model output.

    The base ``regex_pattern`` is tried first; if it fails, we fall back to
    matching the literal choice strings from the doc (mapping each to its
    "(A)"-style letter), and finally to a bare ``:\s*LETTER`` pattern.
    """

    # Lazily-built translation table mapping every Unicode punctuation
    # codepoint to None. Cached at class level so the expensive
    # sys.maxunicode scan runs once, not on every apply() call.
    _punct_tbl = None

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        fallback: str = "[invalid]",
        ignore_case=False,
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
        r"""
        regex_pattern: The basic regex pattern to use. If it fails to match, we
            use the customized match procedure:
            - step 1: parse the choices between ([A-Z])s then try to find these
              choices in the response.
            - step 2: parse the choice with regex :[\s]*([A-?]), where ? varies
              by number of choices.
        group_select: Selects the (group_select)th match from the findall result.
        fallback: Value emitted when no strategy matches.
        ignore_case: Ignore case during step 1 matching.
        ignore_punctuation: Remove punctuation during step 1 matching.
        regexes_to_ignore: Regexes stripped from the text during step 1 matching.
        """
        super().__init__(regex_pattern, group_select, fallback)
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.regexes_to_ignore = regexes_to_ignore

    def apply(self, resps, docs):
        # resps is a list (one entry per doc) of lists of model responses; each
        # response set is filtered independently and the nested-list shape is
        # preserved in the return value.
        if MultiChoiceRegexFilter._punct_tbl is None:
            MultiChoiceRegexFilter._punct_tbl = dict.fromkeys(
                i
                for i in range(sys.maxunicode)
                if unicodedata.category(chr(i)).startswith("P")
            )
        punct_tbl = MultiChoiceRegexFilter._punct_tbl

        def find_match(regex, resp, convert_dict=None):
            # Return the selected findall() match, optionally remapped through
            # convert_dict, or findall()'s falsy result when nothing matches.
            # BUG FIX: the original used a mutable default argument
            # (convert_dict={}); a None guard is the safe equivalent.
            convert_dict = {} if convert_dict is None else convert_dict
            match = regex.findall(resp)
            if match:
                match = match[self.group_select]
                if isinstance(match, tuple):
                    # findall() with multiple groups yields tuples; keep the
                    # first non-empty group.
                    match = [m for m in match if m][0]
                match = match.strip()
                if match and match in convert_dict:
                    match = convert_dict[match]
            return match

        def filter_ignores(st):
            # Normalize text according to the configured step-1 options.
            if self.regexes_to_ignore is not None:
                for s in self.regexes_to_ignore:
                    st = re.sub(s, "", st)
            if self.ignore_case:
                st = st.lower()
            if self.ignore_punctuation:
                # https://stackoverflow.com/a/266162
                st = st.translate(punct_tbl)
            return st

        filtered_resps = []
        for r, doc in zip(resps, docs):
            fallback_regexes = []
            choice_to_alpha = {}
            next_alpha = "A"
            without_paren_fallback_regexes = []
            without_paren_to_target = {}
            for c in doc["choices"]:
                m = filter_ignores(c.strip())
                fallback_regexes.append(f"{re.escape(m)}")
                choice_to_alpha[m] = f"({next_alpha})"
                without_paren_fallback_regexes.append(next_alpha)
                without_paren_to_target[next_alpha] = f"({next_alpha})"
                next_alpha = chr(ord(next_alpha) + 1)
            fallback_regex = re.compile("|".join(fallback_regexes))
            # BUG FIX: the original built this pattern with a non-raw f-string
            # (f":[\s]*(...)"), an invalid escape sequence that is a
            # DeprecationWarning in modern Python; use a raw f-string.
            without_paren_fallback_regex = re.compile(
                rf":[\s]*({'|'.join(without_paren_fallback_regexes)})"
            )
            filtered = []
            for resp in r:
                match = find_match(self.regex, resp)
                if not match:
                    # Step 1: look for the literal choice text in the response.
                    match = find_match(
                        fallback_regex, filter_ignores(resp), choice_to_alpha
                    )
                if not match:
                    # Step 2: look for ": X" with a bare choice letter.
                    match = find_match(
                        without_paren_fallback_regex, resp, without_paren_to_target
                    )
                if not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)
        return filtered_resps
lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
View file @
ab96fc7e
...
@@ -7,13 +7,27 @@ fewshot_config:
...
@@ -7,13 +7,27 @@ fewshot_config:
output_type: generate_until
output_type: generate_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:"
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA:"
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
- name: "strict-match"
filter:
- function: "take_first"
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: 0
regex_pattern: "(\\([A-Z]\\))"
ignore_case: true
ignore_punctuation: true
- function: "take_first"
generation_kwargs:
generation_kwargs:
until:
until:
- "</s>"
- "</s>"
- "Q:"
- "<|im_end|>"
- "<0x0A>"
- "<0x0A>"
metric_list:
metric_list:
- metric: exact_match
- metric: exact_match
aggregation: mean
aggregation: mean
higher_is_better: true
higher_is_better: true
metadata:
metadata:
version:
0
.0
version:
1
.0
lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
0 → 100644
View file @
ab96fc7e
import
re
import
sys
import
unicodedata
from
lm_eval.filters.extraction
import
RegexFilter
class MultiChoiceRegexFilter(RegexFilter):
    r"""Filter that extracts a multiple-choice answer letter from free-form model output.

    The base ``regex_pattern`` is tried first; if it fails, we fall back to
    matching the literal choice strings from the doc (mapping each to its
    "(A)"-style letter), and finally to a bare ``:\s*LETTER`` pattern.
    """

    # Lazily-built translation table mapping every Unicode punctuation
    # codepoint to None. Cached at class level so the expensive
    # sys.maxunicode scan runs once, not on every apply() call.
    _punct_tbl = None

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        fallback: str = "[invalid]",
        ignore_case=False,
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
        r"""
        regex_pattern: The basic regex pattern to use. If it fails to match, we
            use the customized match procedure:
            - step 1: parse the choices between ([A-Z])s then try to find these
              choices in the response.
            - step 2: parse the choice with regex :[\s]*([A-?]), where ? varies
              by number of choices.
        group_select: Selects the (group_select)th match from the findall result.
        fallback: Value emitted when no strategy matches.
        ignore_case: Ignore case during step 1 matching.
        ignore_punctuation: Remove punctuation during step 1 matching.
        regexes_to_ignore: Regexes stripped from the text during step 1 matching.
        """
        super().__init__(regex_pattern, group_select, fallback)
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.regexes_to_ignore = regexes_to_ignore

    def apply(self, resps, docs):
        # resps is a list (one entry per doc) of lists of model responses; each
        # response set is filtered independently and the nested-list shape is
        # preserved in the return value.
        if MultiChoiceRegexFilter._punct_tbl is None:
            MultiChoiceRegexFilter._punct_tbl = dict.fromkeys(
                i
                for i in range(sys.maxunicode)
                if unicodedata.category(chr(i)).startswith("P")
            )
        punct_tbl = MultiChoiceRegexFilter._punct_tbl

        def find_match(regex, resp, convert_dict=None):
            # Return the selected findall() match, optionally remapped through
            # convert_dict, or findall()'s falsy result when nothing matches.
            # BUG FIX: the original used a mutable default argument
            # (convert_dict={}); a None guard is the safe equivalent.
            convert_dict = {} if convert_dict is None else convert_dict
            match = regex.findall(resp)
            if match:
                match = match[self.group_select]
                if isinstance(match, tuple):
                    # findall() with multiple groups yields tuples; keep the
                    # first non-empty group.
                    match = [m for m in match if m][0]
                match = match.strip()
                if match and match in convert_dict:
                    match = convert_dict[match]
            return match

        def filter_ignores(st):
            # Normalize text according to the configured step-1 options.
            if self.regexes_to_ignore is not None:
                for s in self.regexes_to_ignore:
                    st = re.sub(s, "", st)
            if self.ignore_case:
                st = st.lower()
            if self.ignore_punctuation:
                # https://stackoverflow.com/a/266162
                st = st.translate(punct_tbl)
            return st

        filtered_resps = []
        for r, doc in zip(resps, docs):
            fallback_regexes = []
            choice_to_alpha = {}
            next_alpha = "A"
            without_paren_fallback_regexes = []
            without_paren_to_target = {}
            for c in doc["choices"]:
                m = filter_ignores(c.strip())
                fallback_regexes.append(f"{re.escape(m)}")
                choice_to_alpha[m] = f"({next_alpha})"
                without_paren_fallback_regexes.append(next_alpha)
                without_paren_to_target[next_alpha] = f"({next_alpha})"
                next_alpha = chr(ord(next_alpha) + 1)
            fallback_regex = re.compile("|".join(fallback_regexes))
            # BUG FIX: the original built this pattern with a non-raw f-string
            # (f":[\s]*(...)"), an invalid escape sequence that is a
            # DeprecationWarning in modern Python; use a raw f-string.
            without_paren_fallback_regex = re.compile(
                rf":[\s]*({'|'.join(without_paren_fallback_regexes)})"
            )
            filtered = []
            for resp in r:
                match = find_match(self.regex, resp)
                if not match:
                    # Step 1: look for the literal choice text in the response.
                    match = find_match(
                        fallback_regex, filter_ignores(resp), choice_to_alpha
                    )
                if not match:
                    # Step 2: look for ": X" with a bare choice letter.
                    match = find_match(
                        without_paren_fallback_regex, resp, without_paren_to_target
                    )
                if not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)
        return filtered_resps
lm_eval/tasks/okapi/mmlu_multilingual/_default_yaml
0 → 100644
View file @
ab96fc7e
group:
- m_mmlu
dataset_path: alexandrainst/m_mmlu
test_split: test
fewshot_split: train
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
lm_eval/tasks/okapi/mmlu_multilingual/_generate_configs.py
0 → 100644
View file @
ab96fc7e
import
yaml
import
datasets
from
tqdm
import
tqdm
def main() -> None:
    """Generate one m_mmlu_<lang>.yaml task config per m_mmlu dataset subset."""
    dataset_path = "alexandrainst/m_mmlu"

    # Removed the hy and sk subdatasets because the original dataset is broken.
    # I created https://huggingface.co/datasets/alexandrainst/m_mmlu/discussions/3
    # on the dataset for the authors; if it is accepted, the filter can be removed.
    keys_without_hy_sk = list(
        filter(
            lambda k: ("hy" not in k and "sk" not in k),
            datasets.get_dataset_infos(dataset_path).keys(),
        )
    )

    # BUG FIX: tqdm() was called without an iterable, so iterating it raises
    # TypeError and keys_without_hy_sk was never used. Iterate the filtered
    # subset keys, with tqdm providing the progress bar.
    for task in tqdm(keys_without_hy_sk):
        file_name = f"m_mmlu_{task}.yaml"
        try:
            with open(file_name, "w") as f:
                f.write("# Generated by _generate_configs.py\n")
                yaml.dump(
                    {
                        "include": "_default_yaml",
                        "task": f"{dataset_path.split('/')[-1]}_{task}",
                        "dataset_name": task,
                    },
                    f,
                )
        except FileExistsError:
            # NOTE(review): mode "w" never raises FileExistsError (only mode
            # "x" does), so this handler is dead code as written. Kept as-is to
            # preserve the overwrite behavior; switch to mode "x" if the intent
            # is to skip already-generated configs.
            pass


if __name__ == "__main__":
    main()
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ar.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
ar
include
:
_default_yaml
task
:
m_mmlu_ar
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_bn.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
bn
include
:
_default_yaml
task
:
m_mmlu_bn
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ca.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
ca
include
:
_default_yaml
task
:
m_mmlu_ca
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_da.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
da
include
:
_default_yaml
task
:
m_mmlu_da
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_de.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
de
include
:
_default_yaml
task
:
m_mmlu_de
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_en.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
en
include
:
_default_yaml
task
:
m_mmlu_en
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_es.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
es
include
:
_default_yaml
task
:
m_mmlu_es
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_eu.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
eu
include
:
_default_yaml
task
:
m_mmlu_eu
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_fr.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
fr
include
:
_default_yaml
task
:
m_mmlu_fr
lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_gu.yaml
0 → 100644
View file @
ab96fc7e
# Generated by _generate_configs.py
dataset_name
:
gu
include
:
_default_yaml
task
:
m_mmlu_gu
Prev
1
2
3
4
5
6
7
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment