Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
25869601
Commit
25869601
authored
Oct 19, 2024
by
Baber
Browse files
Merge branch 'main' into mathvista
# Conflicts: # lm_eval/models/hf_vlms.py
parents
56f40c53
c1d8795d
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
346 additions
and
0 deletions
+346
-0
lm_eval/tasks/spanish_bench/flores_es/flores_es-gl.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_es-gl.yaml
+7
-0
lm_eval/tasks/spanish_bench/flores_es/flores_es-it.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_es-it.yaml
+7
-0
lm_eval/tasks/spanish_bench/flores_es/flores_es-pt.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_es-pt.yaml
+7
-0
lm_eval/tasks/spanish_bench/flores_es/flores_es.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_es.yaml
+24
-0
lm_eval/tasks/spanish_bench/flores_es/flores_eu-es.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_eu-es.yaml
+7
-0
lm_eval/tasks/spanish_bench/flores_es/flores_fr-es.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_fr-es.yaml
+7
-0
lm_eval/tasks/spanish_bench/flores_es/flores_gl-es.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_gl-es.yaml
+7
-0
lm_eval/tasks/spanish_bench/flores_es/flores_it-es.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_it-es.yaml
+7
-0
lm_eval/tasks/spanish_bench/flores_es/flores_pt-es.yaml
lm_eval/tasks/spanish_bench/flores_es/flores_pt-es.yaml
+7
-0
lm_eval/tasks/spanish_bench/mgsm_direct_es_spanish_bench.yaml
...val/tasks/spanish_bench/mgsm_direct_es_spanish_bench.yaml
+9
-0
lm_eval/tasks/spanish_bench/openbookqa_es.yaml
lm_eval/tasks/spanish_bench/openbookqa_es.yaml
+20
-0
lm_eval/tasks/spanish_bench/paws_es_spanish_bench.yaml
lm_eval/tasks/spanish_bench/paws_es_spanish_bench.yaml
+18
-0
lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common.yaml
...al/tasks/spanish_bench/phrases_es/_phrases_es_common.yaml
+24
-0
lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml
lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml
+7
-0
lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml
lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml
+7
-0
lm_eval/tasks/spanish_bench/spanish_bench.yaml
lm_eval/tasks/spanish_bench/spanish_bench.yaml
+17
-0
lm_eval/tasks/spanish_bench/utils.py
lm_eval/tasks/spanish_bench/utils.py
+109
-0
lm_eval/tasks/spanish_bench/wnli_es.yaml
lm_eval/tasks/spanish_bench/wnli_es.yaml
+14
-0
lm_eval/tasks/spanish_bench/xlsum_es.yaml
lm_eval/tasks/spanish_bench/xlsum_es.yaml
+22
-0
lm_eval/tasks/spanish_bench/xnli_es_spanish_bench.yaml
lm_eval/tasks/spanish_bench/xnli_es_spanish_bench.yaml
+19
-0
No files found.
lm_eval/tasks/spanish_bench/flores_es/flores_es-gl.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_es-gl
doc_to_text
:
'
Spanish
sentence:
{{sentence_spa_Latn}}
Galician
sentence:'
doc_to_target
:
'
{{sentence_glg_Latn}}'
lm_eval/tasks/spanish_bench/flores_es/flores_es-it.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_es-it
doc_to_text
:
'
Spanish
sentence:
{{sentence_spa_Latn}}
Italian
sentence:'
doc_to_target
:
'
{{sentence_ita_Latn}}'
lm_eval/tasks/spanish_bench/flores_es/flores_es-pt.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_es-pt
doc_to_text
:
'
Spanish
sentence:
{{sentence_spa_Latn}}
Portuguese
sentence:'
doc_to_target
:
'
{{sentence_por_Latn}}'
lm_eval/tasks/spanish_bench/flores_es/flores_es.yaml
0 → 100644
View file @
25869601
group
:
flores_es
task
:
-
flores_es-en
-
flores_en-es
-
flores_es-eu
-
flores_eu-es
-
flores_es-pt
-
flores_pt-es
-
flores_es-it
-
flores_it-es
-
flores_es-fr
-
flores_fr-es
-
flores_es-ca
-
flores_ca-es
-
flores_es-gl
-
flores_gl-es
-
flores_es-de
-
flores_de-es
aggregate_metric_list
:
-
metric
:
bleu
aggregation
:
mean
weight_by_size
:
false
metadata
:
version
:
1.0
lm_eval/tasks/spanish_bench/flores_es/flores_eu-es.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_eu-es
doc_to_text
:
'
Basque
sentence:
{{sentence_eus_Latn}}
Spanish
sentence:'
doc_to_target
:
'
{{sentence_spa_Latn}}'
lm_eval/tasks/spanish_bench/flores_es/flores_fr-es.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_fr-es
doc_to_text
:
'
French
sentence:
{{sentence_fra_Latn}}
Spanish
sentence:'
doc_to_target
:
'
{{sentence_spa_Latn}}'
lm_eval/tasks/spanish_bench/flores_es/flores_gl-es.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_gl-es
doc_to_text
:
'
Galician
sentence:
{{sentence_glg_Latn}}
Spanish
sentence:'
doc_to_target
:
'
{{sentence_spa_Latn}}'
lm_eval/tasks/spanish_bench/flores_es/flores_it-es.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_it-es
doc_to_text
:
'
Italian
sentence:
{{sentence_ita_Latn}}
Spanish
sentence:'
doc_to_target
:
'
{{sentence_spa_Latn}}'
lm_eval/tasks/spanish_bench/flores_es/flores_pt-es.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_flores_common_yaml
task
:
flores_pt-es
doc_to_text
:
'
Portuguese
sentence:
{{sentence_por_Latn}}
Spanish
sentence:'
doc_to_target
:
'
{{sentence_spa_Latn}}'
lm_eval/tasks/spanish_bench/mgsm_direct_es_spanish_bench.yaml
0 → 100644
View file @
25869601
include
:
../mgsm/direct/mgsm_direct_es.yaml
doc_to_target
:
'
{{answer_number|string}}'
doc_to_text
:
'
{%
if
answer
is
not
none
%}{{question+"\nRespuesta:
"}}{%
else
%}{{"Pregunta:
"+question+"\nRespuesta:
"}}{%
endif
%}'
generation_kwargs
:
until
:
-
"
\n\n
"
-
"
\n
"
task
:
mgsm_direct_es_spanish_bench
lm_eval/tasks/spanish_bench/openbookqa_es.yaml
0 → 100644
View file @
25869601
task
:
openbookqa_es
dataset_path
:
BSC-LT/openbookqa-es
output_type
:
multiple_choice
training_split
:
null
validation_split
:
validation
test_split
:
test
doc_to_text
:
question_stem
doc_to_target
:
"
{{choices.label.index(answerKey.lstrip())}}"
doc_to_choice
:
"
{{choices.text}}"
should_decontaminate
:
true
doc_to_decontamination_query
:
question_stem
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
-
metric
:
acc_norm
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
1.0
lm_eval/tasks/spanish_bench/paws_es_spanish_bench.yaml
0 → 100644
View file @
25869601
task
:
paws_es_spanish_bench
dataset_path
:
paws-x
dataset_name
:
es
output_type
:
multiple_choice
training_split
:
train
validation_split
:
validation
test_split
:
test
process_docs
:
!function
utils.process_docs_paraphrases
doc_to_text
:
'
'
doc_to_target
:
label
doc_to_choice
:
'
{{[sentence1+",
¿verdad?
No,
"+sentence2,
sentence1+",
¿verdad?
Sí,
"+sentence2]}}'
target_delimiter
:
'
'
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
1.0
lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common.yaml
0 → 100644
View file @
25869601
tag
:
phrases_es
dataset_path
:
gplsi/ES-VA_translation_test
output_type
:
generate_until
training_split
:
null
validation_split
:
null
test_split
:
test
fewshot_split
:
test
num_fewshot
:
5
target_delimiter
:
'
'
generation_kwargs
:
until
:
-
"
\n
"
metric_list
:
-
metric
:
bleu
aggregation
:
bleu
higher_is_better
:
true
-
metric
:
ter
aggregation
:
ter
higher_is_better
:
false
-
metric
:
chrf
aggregation
:
chrf
higher_is_better
:
true
metadata
:
version
:
1.0
lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_phrases_es_common.yaml
task
:
phrases_es-va
doc_to_text
:
'
Oració
en
espanyol:
{{es}}
Oració
en
valencià:'
doc_to_target
:
'
{{va}}'
lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml
0 → 100644
View file @
25869601
# File generated by `create-yamls.py`
include
:
_phrases_es_common.yaml
task
:
phrases_va-es
doc_to_text
:
'
Oració
en
valencià:
{{va}}
Oració
en
espanyol:'
doc_to_target
:
'
{{es}}'
lm_eval/tasks/spanish_bench/spanish_bench.yaml
0 → 100644
View file @
25869601
group
:
spanish_bench
task
:
-
belebele_spa_Latn
-
copa_es
-
escola
-
openbookqa_es
-
wnli_es
-
xnli_es_spanish_bench
-
xstorycloze_es
-
xquad_es
-
xlsum_es
-
paws_es_spanish_bench
-
mgsm_direct_es_spanish_bench
-
flores_es
-
phrases_es
metadata
:
version
:
1.0
lm_eval/tasks/spanish_bench/utils.py
0 → 100644
View file @
25869601
import
re
from
itertools
import
product
import
evaluate
import
transformers.data.metrics.squad_metrics
as
squad_metrics
from
lm_eval.utils
import
general_detokenize
def lowercase_first_letter(text):
    """Return ``text`` with only its first character lowercased.

    Everything after the first character is left untouched. Slicing
    (``text[:1]``) is used instead of indexing (``text[0]``) so that an
    empty string is returned unchanged rather than raising ``IndexError``.

    Args:
        text: The string to transform.

    Returns:
        A new string whose first character is lowercased.
    """
    return text[:1].lower() + text[1:]
def process_doc_nli(dataset):
    """Normalise the ``premise``/``hypothesis`` fields of every NLI document.

    For each document:
      * both fields are passed through ``general_detokenize`` and stripped;
      * a single trailing ``.``, ``,``, ``!`` or ``?`` is removed from the premise;
      * the first letter of the hypothesis is lowercased;
      * the hypothesis is guaranteed to end with a ``.``.

    Args:
        dataset: Object exposing ``map(fn)``; each document must provide
            ``"premise"`` and ``"hypothesis"`` string fields.

    Returns:
        Whatever ``dataset.map`` returns for the per-document normaliser.
    """
    trailing_marks = (".", ",", "!", "?")

    def _normalise(doc):
        # Detokenize and trim both sides of the pair first.
        premise = general_detokenize(doc["premise"]).strip()
        hypothesis = general_detokenize(doc["hypothesis"]).strip()

        # The premise gets a continuation appended by the prompt template,
        # so drop its final punctuation mark if there is one.
        if premise.endswith(trailing_marks):
            premise = premise[:-1]

        # The hypothesis continues a sentence: lowercase start, full stop at end.
        hypothesis = lowercase_first_letter(hypothesis)
        if not hypothesis.endswith("."):
            hypothesis = hypothesis + "."

        doc["premise"] = premise
        doc["hypothesis"] = hypothesis
        return doc

    return dataset.map(_normalise)
def process_results_qa(doc, results):
    """Score one QA prediction against the first gold answer of ``doc``.

    Args:
        doc: Document whose ``doc["answers"]["text"]`` is a sequence of gold
            answers; only the first entry is used.
        results: Sequence whose first element is the model prediction.

    Returns:
        A dict with the keys ``"f1"`` and ``"exact_match"``, computed with
        ``squad_metrics.compute_f1`` and ``squad_metrics.compute_exact``.
    """
    prediction = results[0]
    gold_answer = doc["answers"]["text"][0]
    return {
        "f1": squad_metrics.compute_f1(gold_answer, prediction),
        "exact_match": squad_metrics.compute_exact(gold_answer, prediction),
    }
def process_xlsum(dataset):
    """Collapse runs of spaces in the ``text`` and ``summary`` of each document.

    Args:
        dataset: Object exposing ``map(fn)``; each document must provide
            ``"text"`` and ``"summary"`` string fields.

    Returns:
        Whatever ``dataset.map`` returns for the per-document cleaner.
    """

    def _squeeze_spaces(doc):
        # Any run of one or more space characters becomes a single space.
        for field in ("text", "summary"):
            doc[field] = re.sub(r" +", " ", doc[field])
        return doc

    return dataset.map(_squeeze_spaces)
def process_docs_paraphrases(dataset):
    """Drop incomplete paraphrase pairs and normalise the remaining ones.

    Documents whose ``sentence1`` or ``sentence2`` is ``None`` or ``""`` are
    filtered out. For every remaining document:
      * both sentences are passed through ``general_detokenize`` and stripped;
      * a single trailing ``.``, ``,`` or ``;`` is removed from ``sentence1``;
      * the first letter of ``sentence2`` is lowercased, because the prompt
        template places it after "Sí, ..." / "No, ...".

    The number of dropped documents is reported on stdout. Previously this
    report was evaluated before any document had been inspected, and the
    filter removed empty documents before the collector could see them, so
    the message could never be printed.

    Args:
        dataset: Object exposing ``filter(fn)``, ``map(fn)`` and ``len()``;
            each document must provide ``"sentence1"`` and ``"sentence2"``.

    Returns:
        The filtered dataset with the per-document normaliser mapped over it.
    """

    def _has_both_sentences(doc):
        # Same emptiness criterion as the original filter.
        return doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [
            None,
            "",
        ]

    def _process_doc(doc):
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
        # Remove final punctuation mark in the first sentence
        if doc["sentence1"].endswith((".", ",", ";")):
            doc["sentence1"] = doc["sentence1"][:-1]
        # Start the second sentence in lowercase (to be used after "Yes, ...").
        # A whitespace-only sentence passes the filter but is empty after
        # strip(); skip it so the helper is never called on "".
        if doc["sentence2"]:
            doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
        return doc

    valid_docs = dataset.filter(_has_both_sentences)

    # Report how many documents were discarded, now that it is actually known.
    total_docs = len(dataset)
    num_empty_docs = total_docs - len(valid_docs)
    if num_empty_docs:
        print(
            f"Found {num_empty_docs} empty documents out of the {total_docs} total docs in the dataset; they were filtered out."
        )

    return valid_docs.map(_process_doc)
def process_docs_copa_es(dataset):
    """Lowercase the first letter of both answer choices in every document.

    Args:
        dataset: Object exposing ``map(fn)``; each document must provide
            ``"choice1"`` and ``"choice2"`` string fields.

    Returns:
        Whatever ``dataset.map`` returns for the per-document transform.
    """

    def _lowercase_choices(doc):
        for key in ("choice1", "choice2"):
            doc[key] = lowercase_first_letter(doc[key])
        return doc

    return dataset.map(_lowercase_choices)
def rouge1(items):
    """Per-sample ROUGE-1 hook: returns ``items`` unchanged.

    No score is computed here; the input is handed back as-is so that the
    scoring can be done once, in bulk, by the aggregation function.
    """
    return items
def rouge1_agg(items):
    """Aggregate ROUGE-1 over all collected samples. Higher is better.

    Args:
        items: Iterable of per-sample sequences whose element 0 is the
            reference and element 1 is the prediction.

    Returns:
        The ``"rouge1"`` entry of the result produced by the ``rouge`` metric
        loaded through ``evaluate.load``.
    """
    # Transpose once: the original unzipped ``items`` twice, which is
    # wasteful and yields nothing on the second pass for one-shot iterables.
    columns = list(zip(*items))
    refs, preds = columns[0], columns[1]

    rouge_scorer = evaluate.load("rouge")
    scores = rouge_scorer.compute(predictions=preds, references=refs)
    return scores["rouge1"]
lm_eval/tasks/spanish_bench/wnli_es.yaml
0 → 100644
View file @
25869601
task
:
wnli_es
dataset_path
:
PlanTL-GOB-ES/wnli-es
dataset_name
:
null
output_type
:
multiple_choice
training_split
:
train
validation_split
:
validation
test_split
:
null
doc_to_text
:
"
{{sentence1}}
\n
Pregunta:
{{sentence2}}
¿Verdadero
o
Falso?
\n
Respuesta:"
doc_to_target
:
label
doc_to_choice
:
[
"
Falso"
,
"
Verdadero"
]
metric_list
:
-
metric
:
acc
metadata
:
version
:
1.0
lm_eval/tasks/spanish_bench/xlsum_es.yaml
0 → 100644
View file @
25869601
task
:
xlsum_es
dataset_path
:
csebuetnlp/xlsum
dataset_name
:
spanish
doc_to_text
:
'
Texto:
{{text}}
Resumen:'
doc_to_target
:
'
{{summary}}'
output_type
:
generate_until
test_split
:
test
training_split
:
train
validation_split
:
validation
fewshot_split
:
train
process_docs
:
!function
utils.process_xlsum
metric_list
:
-
metric
:
bleu
aggregation
:
bleu
higher_is_better
:
true
-
metric
:
!function
utils.rouge1
aggregation
:
!function
utils.rouge1_agg
higher_is_better
:
true
metadata
:
version
:
1.0
lm_eval/tasks/spanish_bench/xnli_es_spanish_bench.yaml
0 → 100644
View file @
25869601
# Task configuration derived from EleutherAI's implementation as of March 22, 2024, supplemented with an additional preprocessing function
task
:
xnli_es_spanish_bench
dataset_path
:
xnli
dataset_name
:
es
output_type
:
multiple_choice
doc_to_choice
:
'
{{[premise+",
¿correcto?
Sí,
"+hypothesis,premise+",
¿correcto?
Así
que,
"+hypothesis,premise+",
¿correcto?
No,
"+hypothesis]}}'
doc_to_text
:
'
'
target_delimiter
:
'
'
process_docs
:
!function
utils.process_doc_nli
training_split
:
null
validation_split
:
validation
doc_to_target
:
label
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
1.0
Prev
1
…
7
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment