Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
9822b06e
Unverified
Commit
9822b06e
authored
Mar 01, 2024
by
Lintang Sutawika
Committed by
GitHub
Mar 01, 2024
Browse files
Merge branch 'main' into weight_by_size
parents
51f27158
b177c82c
Changes
656
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
80 additions
and
59 deletions
+80
-59
lm_eval/tasks/kmmlu/kmmlu_patent.yaml
lm_eval/tasks/kmmlu/kmmlu_patent.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_political_science_and_sociology.yaml
...al/tasks/kmmlu/kmmlu_political_science_and_sociology.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_psychology.yaml
lm_eval/tasks/kmmlu/kmmlu_psychology.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_public_safety.yaml
lm_eval/tasks/kmmlu/kmmlu_public_safety.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_railway_and_automotive_engineering.yaml
...tasks/kmmlu/kmmlu_railway_and_automotive_engineering.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_real_estate.yaml
lm_eval/tasks/kmmlu/kmmlu_real_estate.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_refrigerating_machinery.yaml
lm_eval/tasks/kmmlu/kmmlu_refrigerating_machinery.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_social_welfare.yaml
lm_eval/tasks/kmmlu/kmmlu_social_welfare.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_taxation.yaml
lm_eval/tasks/kmmlu/kmmlu_taxation.yaml
+0
-3
lm_eval/tasks/kmmlu/kmmlu_telecommunications_and_wireless_technology.yaml
...mlu/kmmlu_telecommunications_and_wireless_technology.yaml
+0
-3
lm_eval/tasks/kobest/utils.py
lm_eval/tasks/kobest/utils.py
+13
-2
lm_eval/tasks/medmcqa/utils_medmcqa.py
lm_eval/tasks/medmcqa/utils_medmcqa.py
+6
-1
lm_eval/tasks/medqa/preprocess_medqa.py
lm_eval/tasks/medqa/preprocess_medqa.py
+6
-1
lm_eval/tasks/mgsm/direct/direct_yaml
lm_eval/tasks/mgsm/direct/direct_yaml
+7
-1
lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml
lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml
+8
-4
lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml
lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml
+8
-4
lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml
lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml
+8
-4
lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml
lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml
+8
-4
lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml
lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml
+8
-4
lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml
lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml
+8
-4
No files found.
lm_eval/tasks/kmmlu/kmmlu_patent.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Patent"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_patent"
lm_eval/tasks/kmmlu/kmmlu_political_science_and_sociology.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Political-Science-and-Sociology"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_political_science_and_sociology"
lm_eval/tasks/kmmlu/kmmlu_psychology.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Psychology"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_psychology"
lm_eval/tasks/kmmlu/kmmlu_public_safety.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Public-Safety"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_public_safety"
lm_eval/tasks/kmmlu/kmmlu_railway_and_automotive_engineering.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Railway-and-Automotive-Engineering"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_railway_and_automotive_engineering"
lm_eval/tasks/kmmlu/kmmlu_real_estate.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Real-Estate"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_real_estate"
lm_eval/tasks/kmmlu/kmmlu_refrigerating_machinery.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Refrigerating-Machinery"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_refrigerating_machinery"
lm_eval/tasks/kmmlu/kmmlu_social_welfare.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Social-Welfare"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_social_welfare"
lm_eval/tasks/kmmlu/kmmlu_taxation.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Taxation"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_taxation"
lm_eval/tasks/kmmlu/kmmlu_telecommunications_and_wireless_technology.yaml
deleted
100644 → 0
View file @
51f27158
"
dataset_name"
:
"
Telecommunications-and-Wireless-Technology"
"
include"
:
"
_default_kmmlu_yaml"
"
task"
:
"
kmmlu_telecommunications_and_wireless_technology"
lm_eval/tasks/kobest/utils.py
View file @
9822b06e
...
...
@@ -6,32 +6,43 @@ def copa_doc_to_text(doc: dict) -> str:
connector
=
{
"원인"
:
" 왜냐하면"
,
"결과"
:
" 그래서"
}[
doc
[
"question"
].
strip
()]
return
f
"""
{
doc
[
"premise"
]
}
{
connector
}
"""
def
copa_doc_to_target
(
doc
:
dict
)
->
str
:
correct_choice
=
doc
[
"alternative_1"
]
if
doc
[
"label"
]
==
0
else
doc
[
"alternative_2"
]
return
f
"""
{
correct_choice
}
"""
def
copa_doc_to_choice
(
doc
:
dict
)
->
list
:
return
[
f
"""
{
doc
[
"alternative_1"
]
}
"""
,
f
"""
{
doc
[
"alternative_2"
]
}
"""
]
def
sentineg_doc_to_text
(
doc
:
dict
):
return
f
"""문장:
{
doc
[
"sentence"
]
}
긍부정:"""
def
wic_doc_to_text
(
doc
:
dict
)
->
str
:
return
f
"""문장1:
{
doc
[
"context_1"
]
}
문장2:
{
doc
[
"context_2"
]
}
두 문장에서
{
doc
[
"word"
]
}
가 같은 뜻으로 쓰였나?"""
def
hellaswag_process_doc
(
doc
:
Dataset
)
->
Dataset
:
def
preprocessor
(
dataset
):
return
{
"query"
:
f
"""문장:
{
dataset
[
"context"
]
}
"""
,
"choices"
:
[
dataset
[
"ending_1"
],
dataset
[
"ending_2"
],
dataset
[
"ending_3"
],
dataset
[
"ending_4"
]],
"choices"
:
[
dataset
[
"ending_1"
],
dataset
[
"ending_2"
],
dataset
[
"ending_3"
],
dataset
[
"ending_4"
],
],
"gold"
:
int
(
dataset
[
"label"
]),
}
return
doc
.
map
(
preprocessor
)
def
macro_f1_score
(
items
):
unzipped_list
=
list
(
zip
(
*
items
))
golds
=
unzipped_list
[
0
]
preds
=
unzipped_list
[
1
]
fscore
=
f1_score
(
golds
,
preds
,
average
=
'
macro
'
)
fscore
=
f1_score
(
golds
,
preds
,
average
=
"
macro
"
)
return
fscore
lm_eval/tasks/medmcqa/utils_medmcqa.py
View file @
9822b06e
...
...
@@ -10,7 +10,12 @@ def doc_to_text(doc) -> str:
Answer:
"""
choices
=
[
doc
[
"opa"
],
doc
[
"opb"
],
doc
[
"opc"
],
doc
[
"opd"
]]
option_choices
=
{
'A'
:
choices
[
0
],
'B'
:
choices
[
1
],
'C'
:
choices
[
2
],
'D'
:
choices
[
3
]}
option_choices
=
{
"A"
:
choices
[
0
],
"B"
:
choices
[
1
],
"C"
:
choices
[
2
],
"D"
:
choices
[
3
],
}
prompt
=
"Question: "
+
doc
[
"question"
]
+
"
\n
Choices:
\n
"
for
choice
,
option
in
option_choices
.
items
():
...
...
lm_eval/tasks/medqa/preprocess_medqa.py
View file @
9822b06e
def
doc_to_text
(
doc
)
->
str
:
option_choices
=
{
'A'
:
doc
[
"ending0"
],
'B'
:
doc
[
"ending1"
],
'C'
:
doc
[
"ending2"
],
'D'
:
doc
[
"ending3"
]}
option_choices
=
{
"A"
:
doc
[
"ending0"
],
"B"
:
doc
[
"ending1"
],
"C"
:
doc
[
"ending2"
],
"D"
:
doc
[
"ending3"
],
}
answers
=
""
.
join
((
f
"
{
k
}
.
{
v
}
\n
"
)
for
k
,
v
in
option_choices
.
items
())
return
f
"Question:
{
doc
[
'sent1'
]
}
\n
{
answers
}
Answer:"
...
...
lm_eval/tasks/mgsm/direct/direct_yaml
View file @
9822b06e
...
...
@@ -19,6 +19,12 @@ filter_list:
filter:
- function: remove_whitespace
- function: take_first
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
metric_list:
- metric: exact_match
aggregation: mean
...
...
@@ -26,4 +32,4 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
version:
1
.0
version:
2
.0
lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml
View file @
9822b06e
# Generated by utils.py
dataset_name
:
bn
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[6+1]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer"}}{%
else
%}{{"প্রশ্ন:
"+question+"\nAnswer"}}{%
endif
%}'
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[17:]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer:"}}{%
else
%}{{"প্রশ্ন:
"+question+"\nAnswer:"}}{%
endif
%}'
generation_kwargs
:
do_sample
:
false
until
:
-
'
প্রশ্ন:'
-
</s>
-
<|im_end|>
include
:
direct_yaml
task
:
mgsm_direct_bn
lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml
View file @
9822b06e
# Generated by utils.py
dataset_name
:
de
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[7+1]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAntwort"}}{%
else
%}{{"Frage:
"+question+"\nAntwort"}}{%
endif
%}'
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[29:]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAntwort:"}}{%
else
%}{{"Frage:
"+question+"\nAntwort:"}}{%
endif
%}'
generation_kwargs
:
do_sample
:
false
until
:
-
'
Frage:'
-
</s>
-
<|im_end|>
include
:
direct_yaml
task
:
mgsm_direct_de
lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml
View file @
9822b06e
# Generated by utils.py
dataset_name
:
en
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[6+1]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer"}}{%
else
%}{{"Question:
"+question+"\nAnswer"}}{%
endif
%}'
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[21:]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer:"}}{%
else
%}{{"Question:
"+question+"\nAnswer:"}}{%
endif
%}'
generation_kwargs
:
do_sample
:
false
until
:
-
'
Question:'
-
</s>
-
<|im_end|>
include
:
direct_yaml
task
:
mgsm_direct_en
lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml
View file @
9822b06e
# Generated by utils.py
dataset_name
:
es
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[6+1]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer"}}{%
else
%}{{"Pregunta:
"+question+"\nAnswer"}}{%
endif
%}'
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[23:]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nRespuesta:"}}{%
else
%}{{"Pregunta:
"+question+"\nRespuesta:"}}{%
endif
%}'
generation_kwargs
:
do_sample
:
false
until
:
-
'
Pregunta:'
-
</s>
-
<|im_end|>
include
:
direct_yaml
task
:
mgsm_direct_es
lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml
View file @
9822b06e
# Generated by utils.py
dataset_name
:
fr
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[6+1]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer"}}{%
else
%}{{"Question
:
"+question+"\nAnswer"}}{%
endif
%}'
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[26:]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nRéponse
:"}}{%
else
%}{{"Question
:
"+question+"\nRéponse
:"}}{%
endif
%}'
generation_kwargs
:
do_sample
:
false
until
:
-
'
Question
:'
-
</s>
-
<|im_end|>
include
:
direct_yaml
task
:
mgsm_direct_fr
lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml
View file @
9822b06e
# Generated by utils.py
dataset_name
:
ja
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[6+1]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer"}}{%
else
%}{{"問題:
"+question+"\nAnswer"}}{%
endif
%}'
doc_to_target
:
'
{%
if
answer
is
not
none
%}{{answer[11:]}}{%
else
%}{{answer_number|string}}{%
endif
%}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nAnswer:"}}{%
else
%}{{"問題:
"+question+"\nAnswer:"}}{%
endif
%}'
generation_kwargs
:
do_sample
:
false
until
:
-
'
問題:'
-
</s>
-
<|im_end|>
include
:
direct_yaml
task
:
mgsm_direct_ja
Prev
1
…
18
19
20
21
22
23
24
25
26
…
33
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment