Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
5b7ef05f
Commit
5b7ef05f
authored
Oct 14, 2025
by
Baber
Browse files
update mrl subsets
parent
2e8aa9c2
Changes
119
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
137 additions
and
261 deletions
+137
-261
lm_eval/models/openai_completions.py
lm_eval/models/openai_completions.py
+2
-9
lm_eval/tasks/mrl/gen/_generate_config.py
lm_eval/tasks/mrl/gen/_generate_config.py
+2
-3
lm_eval/tasks/mrl/gen/_global_piqa_gen.yaml
lm_eval/tasks/mrl/gen/_global_piqa_gen.yaml
+116
-232
lm_eval/tasks/mrl/gen/acm_arab.yaml
lm_eval/tasks/mrl/gen/acm_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/acq_arab.yaml
lm_eval/tasks/mrl/gen/acq_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/aeb_arab.yaml
lm_eval/tasks/mrl/gen/aeb_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/afb_arab.yaml
lm_eval/tasks/mrl/gen/afb_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/als_latn.yaml
lm_eval/tasks/mrl/gen/als_latn.yaml
+1
-1
lm_eval/tasks/mrl/gen/amh_ethi.yaml
lm_eval/tasks/mrl/gen/amh_ethi.yaml
+1
-1
lm_eval/tasks/mrl/gen/apc_arab_jord.yaml
lm_eval/tasks/mrl/gen/apc_arab_jord.yaml
+1
-1
lm_eval/tasks/mrl/gen/apc_arab_leba.yaml
lm_eval/tasks/mrl/gen/apc_arab_leba.yaml
+1
-1
lm_eval/tasks/mrl/gen/apc_arab_pale.yaml
lm_eval/tasks/mrl/gen/apc_arab_pale.yaml
+1
-1
lm_eval/tasks/mrl/gen/apc_arab_syri.yaml
lm_eval/tasks/mrl/gen/apc_arab_syri.yaml
+1
-1
lm_eval/tasks/mrl/gen/arb_arab.yaml
lm_eval/tasks/mrl/gen/arb_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/arq_arab.yaml
lm_eval/tasks/mrl/gen/arq_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/ars_arab.yaml
lm_eval/tasks/mrl/gen/ars_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/ary_arab.yaml
lm_eval/tasks/mrl/gen/ary_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/arz_arab.yaml
lm_eval/tasks/mrl/gen/arz_arab.yaml
+1
-1
lm_eval/tasks/mrl/gen/asm_beng.yaml
lm_eval/tasks/mrl/gen/asm_beng.yaml
+1
-1
lm_eval/tasks/mrl/gen/azj_latn.yaml
lm_eval/tasks/mrl/gen/azj_latn.yaml
+1
-1
No files found.
lm_eval/models/openai_completions.py
View file @
5b7ef05f
...
...
@@ -89,15 +89,7 @@ class LocalCompletionsAPI(TemplateAPI):
@
staticmethod
def
parse_generations
(
outputs
:
Union
[
Dict
,
List
[
Dict
]],
**
kwargs
)
->
List
[
str
]:
res
=
[]
if
not
isinstance
(
outputs
,
list
):
outputs
=
[
outputs
]
for
out
in
outputs
:
tmp
=
[
None
]
*
len
(
out
[
"choices"
])
for
choices
in
out
[
"choices"
]:
tmp
[
choices
[
"index"
]]
=
choices
[
"text"
]
res
=
res
+
tmp
return
res
return
[
""
]
*
len
(
outputs
)
@
property
def
api_key
(
self
):
...
...
@@ -252,6 +244,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
raise
ValueError
(
"API key not found. Please set the `OPENAI_API_KEY` environment variable."
)
return
None
return
key
def
loglikelihood
(
self
,
requests
,
**
kwargs
):
...
...
lm_eval/tasks/mrl/gen/_generate_config.py
View file @
5b7ef05f
...
...
@@ -15,15 +15,14 @@ if __name__ == "__main__":
for
s
in
subsets
:
with
open
(
PARENT
/
f
"
{
s
}
.yaml"
,
"w"
)
as
f
:
f
.
write
(
"include: '_template'
\n
"
)
f
.
write
(
f
"task:
mrl_gen_
{
s
}
\n
"
)
f
.
write
(
f
"task:
{
s
}
\n
"
)
f
.
write
(
f
"dataset_name:
{
s
}
\n
"
)
with
open
(
PARENT
/
"_global_piqa_gen.yaml"
,
"w"
)
as
f
:
f
.
write
(
"group: global_piqa_gen
\n
"
)
f
.
write
(
"task:
\n
"
)
for
s
in
subsets
:
f
.
write
(
f
" - task: mrl_gen_
{
s
}
\n
"
)
f
.
write
(
f
" task_alias:
{
s
}
\n
"
)
f
.
write
(
f
" - task:
{
s
}
\n
"
)
f
.
write
(
"aggregate_metric_list:
\n
"
)
f
.
write
(
" - metric: exact_match
\n
"
)
f
.
write
(
" aggregation: mean
\n
"
)
...
...
lm_eval/tasks/mrl/gen/_global_piqa_gen.yaml
View file @
5b7ef05f
group
:
global_piqa_gen
task
:
-
task
:
mrl_gen_acm_arab
task_alias
:
acm_arab
-
task
:
mrl_gen_acq_arab
task_alias
:
acq_arab
-
task
:
mrl_gen_aeb_arab
task_alias
:
aeb_arab
-
task
:
mrl_gen_afb_arab
task_alias
:
afb_arab
-
task
:
mrl_gen_als_latn
task_alias
:
als_latn
-
task
:
mrl_gen_amh_ethi
task_alias
:
amh_ethi
-
task
:
mrl_gen_apc_arab_jord
task_alias
:
apc_arab_jord
-
task
:
mrl_gen_apc_arab_leba
task_alias
:
apc_arab_leba
-
task
:
mrl_gen_apc_arab_pale
task_alias
:
apc_arab_pale
-
task
:
mrl_gen_apc_arab_syri
task_alias
:
apc_arab_syri
-
task
:
mrl_gen_arb_arab
task_alias
:
arb_arab
-
task
:
mrl_gen_arq_arab
task_alias
:
arq_arab
-
task
:
mrl_gen_ars_arab
task_alias
:
ars_arab
-
task
:
mrl_gen_ary_arab
task_alias
:
ary_arab
-
task
:
mrl_gen_arz_arab
task_alias
:
arz_arab
-
task
:
mrl_gen_asm_beng
task_alias
:
asm_beng
-
task
:
mrl_gen_azj_latn
task_alias
:
azj_latn
-
task
:
mrl_gen_bam_latn
task_alias
:
bam_latn
-
task
:
mrl_gen_bel_cyrl
task_alias
:
bel_cyrl
-
task
:
mrl_gen_ben_beng
task_alias
:
ben_beng
-
task
:
mrl_gen_ben_latn
task_alias
:
ben_latn
-
task
:
mrl_gen_bho_deva
task_alias
:
bho_deva
-
task
:
mrl_gen_bos_latn
task_alias
:
bos_latn
-
task
:
mrl_gen_bsk_arab
task_alias
:
bsk_arab
-
task
:
mrl_gen_bul_cyrl
task_alias
:
bul_cyrl
-
task
:
mrl_gen_cat_latn
task_alias
:
cat_latn
-
task
:
mrl_gen_ces_latn
task_alias
:
ces_latn
-
task
:
mrl_gen_ckb_arab
task_alias
:
ckb_arab
-
task
:
mrl_gen_ckm_latn
task_alias
:
ckm_latn
-
task
:
mrl_gen_cmn_hans
task_alias
:
cmn_hans
-
task
:
mrl_gen_cmn_hant
task_alias
:
cmn_hant
-
task
:
mrl_gen_deu_latn
task_alias
:
deu_latn
-
task
:
mrl_gen_dhd_deva
task_alias
:
dhd_deva
-
task
:
mrl_gen_ekp_latn
task_alias
:
ekp_latn
-
task
:
mrl_gen_ell_grek
task_alias
:
ell_grek
-
task
:
mrl_gen_eng_latn
task_alias
:
eng_latn
-
task
:
mrl_gen_est_latn
task_alias
:
est_latn
-
task
:
mrl_gen_fao_latn
task_alias
:
fao_latn
-
task
:
mrl_gen_fin_latn
task_alias
:
fin_latn
-
task
:
mrl_gen_fra_latn_cana
task_alias
:
fra_latn_cana
-
task
:
mrl_gen_fra_latn_fran
task_alias
:
fra_latn_fran
-
task
:
mrl_gen_glg_latn
task_alias
:
glg_latn
-
task
:
mrl_gen_guj_gujr
task_alias
:
guj_gujr
-
task
:
mrl_gen_hau_latn
task_alias
:
hau_latn
-
task
:
mrl_gen_haw_latn
task_alias
:
haw_latn
-
task
:
mrl_gen_heb_hebr
task_alias
:
heb_hebr
-
task
:
mrl_gen_hin_deva
task_alias
:
hin_deva
-
task
:
mrl_gen_hrv_latn
task_alias
:
hrv_latn
-
task
:
mrl_gen_hun_latn
task_alias
:
hun_latn
-
task
:
mrl_gen_hye_armn
task_alias
:
hye_armn
-
task
:
mrl_gen_ibo_latn
task_alias
:
ibo_latn
-
task
:
mrl_gen_idu_latn
task_alias
:
idu_latn
-
task
:
mrl_gen_ind_latn
task_alias
:
ind_latn
-
task
:
mrl_gen_isl_latn
task_alias
:
isl_latn
-
task
:
mrl_gen_iso_latn
task_alias
:
iso_latn
-
task
:
mrl_gen_ita_latn
task_alias
:
ita_latn
-
task
:
mrl_gen_jav_latn
task_alias
:
jav_latn
-
task
:
mrl_gen_jpn_jpan
task_alias
:
jpn_jpan
-
task
:
mrl_gen_kan_knda
task_alias
:
kan_knda
-
task
:
mrl_gen_kat_geor
task_alias
:
kat_geor
-
task
:
mrl_gen_kaz_cyrl
task_alias
:
kaz_cyrl
-
task
:
mrl_gen_kin_latn
task_alias
:
kin_latn
-
task
:
mrl_gen_kir_cyrl
task_alias
:
kir_cyrl
-
task
:
mrl_gen_kor_hang
task_alias
:
kor_hang
-
task
:
mrl_gen_lin_latn
task_alias
:
lin_latn
-
task
:
mrl_gen_lit_latn
task_alias
:
lit_latn
-
task
:
mrl_gen_luo_latn
task_alias
:
luo_latn
-
task
:
mrl_gen_mal_mlym
task_alias
:
mal_mlym
-
task
:
mrl_gen_mar_deva
task_alias
:
mar_deva
-
task
:
mrl_gen_mkd_cyrl
task_alias
:
mkd_cyrl
-
task
:
mrl_gen_mni_beng
task_alias
:
mni_beng
-
task
:
mrl_gen_mni_mtei
task_alias
:
mni_mtei
-
task
:
mrl_gen_nag_latn
task_alias
:
nag_latn
-
task
:
mrl_gen_nld_latn
task_alias
:
nld_latn
-
task
:
mrl_gen_nno_latn
task_alias
:
nno_latn
-
task
:
mrl_gen_nob_latn
task_alias
:
nob_latn
-
task
:
mrl_gen_npi_deva
task_alias
:
npi_deva
-
task
:
mrl_gen_pan_guru
task_alias
:
pan_guru
-
task
:
mrl_gen_pcm_latn
task_alias
:
pcm_latn
-
task
:
mrl_gen_pes_arab
task_alias
:
pes_arab
-
task
:
mrl_gen_pol_latn
task_alias
:
pol_latn
-
task
:
mrl_gen_por_latn_braz
task_alias
:
por_latn_braz
-
task
:
mrl_gen_por_latn_port
task_alias
:
por_latn_port
-
task
:
mrl_gen_ron_latn
task_alias
:
ron_latn
-
task
:
mrl_gen_rus_cyrl
task_alias
:
rus_cyrl
-
task
:
mrl_gen_rwr_deva
task_alias
:
rwr_deva
-
task
:
mrl_gen_sin_sinh
task_alias
:
sin_sinh
-
task
:
mrl_gen_slk_latn
task_alias
:
slk_latn
-
task
:
mrl_gen_slk_latn_sari
task_alias
:
slk_latn_sari
-
task
:
mrl_gen_slv_latn
task_alias
:
slv_latn
-
task
:
mrl_gen_slv_latn_cerk
task_alias
:
slv_latn_cerk
-
task
:
mrl_gen_snd_arab
task_alias
:
snd_arab
-
task
:
mrl_gen_snd_deva
task_alias
:
snd_deva
-
task
:
mrl_gen_spa_latn_mexi
task_alias
:
spa_latn_mexi
-
task
:
mrl_gen_spa_latn_peru
task_alias
:
spa_latn_peru
-
task
:
mrl_gen_spa_latn_spai
task_alias
:
spa_latn_spai
-
task
:
mrl_gen_srp_cyrl
task_alias
:
srp_cyrl
-
task
:
mrl_gen_srp_latn
task_alias
:
srp_latn
-
task
:
mrl_gen_swe_latn
task_alias
:
swe_latn
-
task
:
mrl_gen_swh_latn
task_alias
:
swh_latn
-
task
:
mrl_gen_tam_taml
task_alias
:
tam_taml
-
task
:
mrl_gen_tel_telu
task_alias
:
tel_telu
-
task
:
mrl_gen_tgl_latn
task_alias
:
tgl_latn
-
task
:
mrl_gen_tha_thai
task_alias
:
tha_thai
-
task
:
mrl_gen_tur_latn
task_alias
:
tur_latn
-
task
:
mrl_gen_uig_arab
task_alias
:
uig_arab
-
task
:
mrl_gen_ukr_cyrl
task_alias
:
ukr_cyrl
-
task
:
mrl_gen_urd_arab
task_alias
:
urd_arab
-
task
:
mrl_gen_urd_latn
task_alias
:
urd_latn
-
task
:
mrl_gen_urh_latn
task_alias
:
urh_latn
-
task
:
mrl_gen_uzn_latn
task_alias
:
uzn_latn
-
task
:
mrl_gen_vie_latn
task_alias
:
vie_latn
-
task
:
mrl_gen_yor_latn
task_alias
:
yor_latn
-
task
:
mrl_gen_yue_hant
task_alias
:
yue_hant
-
task
:
mrl_gen_zsm_latn
task_alias
:
zsm_latn
-
task
:
mrl_gen_zul_latn
task_alias
:
zul_latn
-
task
:
acm_arab
-
task
:
acq_arab
-
task
:
aeb_arab
-
task
:
afb_arab
-
task
:
als_latn
-
task
:
amh_ethi
-
task
:
apc_arab_jord
-
task
:
apc_arab_leba
-
task
:
apc_arab_pale
-
task
:
apc_arab_syri
-
task
:
arb_arab
-
task
:
arq_arab
-
task
:
ars_arab
-
task
:
ary_arab
-
task
:
arz_arab
-
task
:
asm_beng
-
task
:
azj_latn
-
task
:
bam_latn
-
task
:
bel_cyrl
-
task
:
ben_beng
-
task
:
ben_latn
-
task
:
bho_deva
-
task
:
bos_latn
-
task
:
bsk_arab
-
task
:
bul_cyrl
-
task
:
cat_latn
-
task
:
ces_latn
-
task
:
ckb_arab
-
task
:
ckm_latn
-
task
:
cmn_hans
-
task
:
cmn_hant
-
task
:
deu_latn
-
task
:
dhd_deva
-
task
:
ekp_latn
-
task
:
ell_grek
-
task
:
eng_latn
-
task
:
est_latn
-
task
:
fao_latn
-
task
:
fin_latn
-
task
:
fra_latn_cana
-
task
:
fra_latn_fran
-
task
:
glg_latn
-
task
:
guj_gujr
-
task
:
hau_latn
-
task
:
haw_latn
-
task
:
heb_hebr
-
task
:
hin_deva
-
task
:
hrv_latn
-
task
:
hun_latn
-
task
:
hye_armn
-
task
:
ibo_latn
-
task
:
idu_latn
-
task
:
ind_latn
-
task
:
isl_latn
-
task
:
iso_latn
-
task
:
ita_latn
-
task
:
jav_latn
-
task
:
jpn_jpan
-
task
:
kan_knda
-
task
:
kat_geor
-
task
:
kaz_cyrl
-
task
:
kin_latn
-
task
:
kir_cyrl
-
task
:
kor_hang
-
task
:
lin_latn
-
task
:
lit_latn
-
task
:
luo_latn
-
task
:
mal_mlym
-
task
:
mar_deva
-
task
:
mkd_cyrl
-
task
:
mni_beng
-
task
:
mni_mtei
-
task
:
nag_latn
-
task
:
nld_latn
-
task
:
nno_latn
-
task
:
nob_latn
-
task
:
npi_deva
-
task
:
pan_guru
-
task
:
pcm_latn
-
task
:
pes_arab
-
task
:
pol_latn
-
task
:
por_latn_braz
-
task
:
por_latn_port
-
task
:
ron_latn
-
task
:
rus_cyrl
-
task
:
rwr_deva
-
task
:
sin_sinh
-
task
:
slk_latn
-
task
:
slk_latn_sari
-
task
:
slv_latn
-
task
:
slv_latn_cerk
-
task
:
snd_arab
-
task
:
snd_deva
-
task
:
spa_latn_mexi
-
task
:
spa_latn_peru
-
task
:
spa_latn_spai
-
task
:
srp_cyrl
-
task
:
srp_latn
-
task
:
swe_latn
-
task
:
swh_latn
-
task
:
tam_taml
-
task
:
tel_telu
-
task
:
tgl_latn
-
task
:
tha_thai
-
task
:
tur_latn
-
task
:
uig_arab
-
task
:
ukr_cyrl
-
task
:
urd_arab
-
task
:
urd_latn
-
task
:
urh_latn
-
task
:
uzn_latn
-
task
:
vie_latn
-
task
:
yor_latn
-
task
:
yue_hant
-
task
:
zsm_latn
-
task
:
zul_latn
aggregate_metric_list
:
-
metric
:
exact_match
aggregation
:
mean
...
...
lm_eval/tasks/mrl/gen/acm_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
acm_arab
task
:
acm_arab
dataset_name
:
acm_arab
lm_eval/tasks/mrl/gen/acq_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
acq_arab
task
:
acq_arab
dataset_name
:
acq_arab
lm_eval/tasks/mrl/gen/aeb_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
aeb_arab
task
:
aeb_arab
dataset_name
:
aeb_arab
lm_eval/tasks/mrl/gen/afb_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
afb_arab
task
:
afb_arab
dataset_name
:
afb_arab
lm_eval/tasks/mrl/gen/als_latn.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
als_latn
task
:
als_latn
dataset_name
:
als_latn
lm_eval/tasks/mrl/gen/amh_ethi.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
amh_ethi
task
:
amh_ethi
dataset_name
:
amh_ethi
lm_eval/tasks/mrl/gen/apc_arab_jord.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
apc_arab_jord
task
:
apc_arab_jord
dataset_name
:
apc_arab_jord
lm_eval/tasks/mrl/gen/apc_arab_leba.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
apc_arab_leba
task
:
apc_arab_leba
dataset_name
:
apc_arab_leba
lm_eval/tasks/mrl/gen/apc_arab_pale.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
apc_arab_pale
task
:
apc_arab_pale
dataset_name
:
apc_arab_pale
lm_eval/tasks/mrl/gen/apc_arab_syri.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
apc_arab_syri
task
:
apc_arab_syri
dataset_name
:
apc_arab_syri
lm_eval/tasks/mrl/gen/arb_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
arb_arab
task
:
arb_arab
dataset_name
:
arb_arab
lm_eval/tasks/mrl/gen/arq_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
arq_arab
task
:
arq_arab
dataset_name
:
arq_arab
lm_eval/tasks/mrl/gen/ars_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
ars_arab
task
:
ars_arab
dataset_name
:
ars_arab
lm_eval/tasks/mrl/gen/ary_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
ary_arab
task
:
ary_arab
dataset_name
:
ary_arab
lm_eval/tasks/mrl/gen/arz_arab.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
arz_arab
task
:
arz_arab
dataset_name
:
arz_arab
lm_eval/tasks/mrl/gen/asm_beng.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
asm_beng
task
:
asm_beng
dataset_name
:
asm_beng
lm_eval/tasks/mrl/gen/azj_latn.yaml
View file @
5b7ef05f
include
:
'
_template'
task
:
mrl_gen_
azj_latn
task
:
azj_latn
dataset_name
:
azj_latn
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment