gaoqiong / lm-evaluation-harness · Commits

Commit 2b56339e, authored Jan 17, 2025 by Baber
Merge branch 'main' into longcxt
Parents: 0b533339, 703fbffd
Changes: 316
Showing 16 changed files with 252 additions and 22 deletions (+252 −22)
lm_eval/tasks/mlqa/mlqa_vi_zh.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_ar.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_de.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_en.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_es.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_hi.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_vi.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_zh.yaml                              +5   −0
lm_eval/tasks/mlqa/utils.py                                     +165 −0
lm_eval/tasks/model_written_evals/winogenerated/winogenerated   +0   −0
lm_eval/tasks/scrolls/task.py                                   +18  −6
lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common       +0   −0
lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml       +1   −1
lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml       +1   −1
lm_eval/utils.py                                                +2   −1
scripts/zeno_visualize.py                                       +25  −13
lm_eval/tasks/mlqa/mlqa_vi_zh.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_vi_zh
dataset_name: mlqa.vi.zh
process_results: !function utils.process_results_vi

lm_eval/tasks/mlqa/mlqa_zh_ar.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_ar
dataset_name: mlqa.zh.ar
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_de.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_de
dataset_name: mlqa.zh.de
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_en.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_en
dataset_name: mlqa.zh.en
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_es.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_es
dataset_name: mlqa.zh.es
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_hi.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_hi
dataset_name: mlqa.zh.hi
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_vi.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_vi
dataset_name: mlqa.zh.vi
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_zh.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_zh
dataset_name: mlqa.zh.zh
process_results: !function utils.process_results_zh
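Each of these new configs only swaps the task name, the MLQA language-pair dataset configuration, and the language-specific `process_results` hook on top of the shared `mlqa_common_yaml` include. As a quick sanity check that the pairs are registered, something along these lines should work (illustrative sketch, not part of this commit; `TaskManager.all_tasks` is the listing API lm-eval exposes):

# Illustrative sketch (not part of this commit): list the newly added zh-* pairs.
from lm_eval.tasks import TaskManager

task_manager = TaskManager()
mlqa_zh_tasks = [name for name in task_manager.all_tasks if name.startswith("mlqa_zh")]
print(mlqa_zh_tasks)
# Expected to include: mlqa_zh_ar, mlqa_zh_de, mlqa_zh_en, mlqa_zh_es,
# mlqa_zh_hi, mlqa_zh_vi, mlqa_zh_zh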
lm_eval/tasks/mlqa/utils.py  (new file, mode 100644)

"""
Code based on Official evaluation script for the MLQA dataset.
Repo: https://github.com/facebookresearch/MLQA/blob/main/mlqa_evaluation_v1.py
"""

import re
import string
import sys
import unicodedata
from collections import Counter

import datasets


PUNCT = {
    chr(i)
    for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith("P")
}.union(string.punctuation)
WHITESPACE_LANGS = ["en", "es", "hi", "vi", "de", "ar"]
MIXED_SEGMENTATION_LANGS = ["zh"]


def whitespace_tokenize(text):
    return text.split()


def mixed_segmentation(text):
    segs_out = []
    temp_str = ""
    for char in text:
        if re.search(r"[\u4e00-\u9fa5]", char) or char in PUNCT:
            if temp_str != "":
                ss = whitespace_tokenize(temp_str)
                segs_out.extend(ss)
                temp_str = ""
            segs_out.append(char)
        else:
            temp_str += char
    if temp_str != "":
        ss = whitespace_tokenize(temp_str)
        segs_out.extend(ss)
    return segs_out


def normalize_answer(s, lang):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text, lang):
        if lang == "en":
            return re.sub(r"\b(a|an|the)\b", " ", text)
        elif lang == "es":
            return re.sub(r"\b(un|una|unos|unas|el|la|los|las)\b", " ", text)
        elif lang == "hi":
            return text  # Hindi does not have formal articles
        elif lang == "vi":
            return re.sub(r"\b(của|là|cái|chiếc|những)\b", " ", text)
        elif lang == "de":
            return re.sub(
                r"\b(ein|eine|einen|einem|eines|einer|der|die|das|den|dem|des)\b",
                " ",
                text,
            )
        elif lang == "ar":
            return re.sub(r"\sال^|ال", " ", text)
        elif lang == "zh":
            return text  # Chinese does not have formal articles
        else:
            raise Exception("Unknown Language {}".format(lang))

    def white_space_fix(text, lang):
        if lang in WHITESPACE_LANGS:
            tokens = whitespace_tokenize(text)
        elif lang in MIXED_SEGMENTATION_LANGS:
            tokens = mixed_segmentation(text)
        else:
            raise Exception("Unknown Language {}".format(lang))
        return " ".join([t for t in tokens if t.strip() != ""])

    def remove_punc(text):
        return "".join(ch for ch in text if ch not in PUNCT)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s)), lang), lang)


def f1_score(prediction, ground_truth, lang):
    prediction_tokens = normalize_answer(prediction, lang).split()
    ground_truth_tokens = normalize_answer(ground_truth, lang).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth, lang):
    return normalize_answer(prediction, lang) == normalize_answer(ground_truth, lang)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths, lang):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth, lang)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        out_doc = {
            "context": doc["context"],
            "question": doc["question"],
            "answers": doc["answers"]["text"],
        }
        return out_doc

    return dataset.map(_process_doc)


# Base function
def process_results_lang(doc, results, lang):
    ground_truths = doc["answers"]
    prediction = results[0].strip()
    exact_match = metric_max_over_ground_truths(
        exact_match_score, prediction, ground_truths, lang
    )
    f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths, lang)
    return {"exact_match": exact_match, "f1": f1}


# Language Wrapper functions
def process_results_en(doc, results):
    return process_results_lang(doc, results, "en")


def process_results_es(doc, results):
    return process_results_lang(doc, results, "es")


def process_results_hi(doc, results):
    return process_results_lang(doc, results, "hi")


def process_results_vi(doc, results):
    return process_results_lang(doc, results, "vi")


def process_results_de(doc, results):
    return process_results_lang(doc, results, "de")


def process_results_ar(doc, results):
    return process_results_lang(doc, results, "ar")


def process_results_zh(doc, results):
    return process_results_lang(doc, results, "zh")
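As a quick illustration of how these helpers behave (hypothetical usage, not part of the commit): English normalization lowercases, strips punctuation and articles, and re-joins on whitespace, while the Chinese path segments answers character by character before the token-level F1 is computed.

# Hypothetical usage of the helpers above (not part of this commit).
print(normalize_answer("The Great Wall!", "en"))    # -> "great wall"
print(normalize_answer("长城很长。", "zh"))           # -> "长 城 很 长" (character-level segmentation)
print(exact_match_score("The wall", "wall", "en"))  # -> True (article and case are stripped)
print(f1_score("长城", "长城很长", "zh"))             # -> 0.666... (precision 1.0, recall 0.5)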
lm_eval/tasks/model_written_evals/winogenerated/winogenerated.yaml → lm_eval/tasks/model_written_evals/winogenerated/winogenerated
File moved
lm_eval/tasks/scrolls/task.py

@@ -4,7 +4,8 @@ from functools import reduce
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from datasets import Dataset, load_metric
+from datasets import Dataset
+from evaluate import load
 from transformers import AutoTokenizer

 from lm_eval.api.instance import Instance

@@ -48,7 +49,10 @@ def _download_metric():
     from huggingface_hub import hf_hub_download

     scrolls_metric_path = hf_hub_download(
-        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+        repo_id="tau/scrolls",
+        repo_type="dataset",
+        filename="metrics/scrolls.py",
+        revision="refs/pr/5",
     )
     updated_scrolls_metric_path = (
         os.path.dirname(scrolls_metric_path)

@@ -119,7 +123,7 @@ class _SCROLLSTask(ConfigurableTask):
     def __init__(self, config=None):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         if self.DATASET_NAME is not None:
-            self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            self.metric = load(_download_metric(), config_name=self.DATASET_NAME)

     def has_training_docs(self):
         return True

@@ -253,11 +257,14 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
         }

     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
         request_list = [
             Instance(
                 request_type="loglikelihood",
                 doc=doc,
-                arguments=(ctx, " {}".format(choice)),
+                arguments=(ctx, " {}".format(choice))
+                if not apply_chat_template
+                else (ctx, "{}".format(choice)),
                 idx=i,
                 **kwargs,
             )

@@ -285,6 +292,7 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
         }

     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
         return Instance(
             request_type="generate_until",
             doc=doc,

@@ -327,19 +335,22 @@ class Qasper(_SCROLLSTask):
         return {"f1": (prediction, doc["outputs"])}

     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
         if doc["is_yes_no"]:
             return [
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " yes"),
+                    arguments=(ctx, " yes")
+                    if not apply_chat_template
+                    else (ctx, "yes"),
                     idx=0,
                     **kwargs,
                 ),
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " no"),
+                    arguments=(ctx, " no")
+                    if not apply_chat_template
+                    else (ctx, "no"),
                     idx=1,
                     **kwargs,
                 ),

@@ -406,6 +417,7 @@ class NarrativeQA(_SCROLLSTask):
         return {"f1": (results[0], doc["outputs"])}

     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
         return Instance(
             request_type="generate_until",
             doc=doc,
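The recurring change in these hunks is that construct_requests now pops an apply_chat_template flag from kwargs and, for loglikelihood requests, drops the leading space on the continuation when a chat template is in use. A minimal standalone sketch of that spacing rule (the helper below is illustrative and does not exist in the repository):

# Illustrative helper mirroring the argument-building rule in the diff above;
# this function itself is not part of the repository.
def build_arguments(ctx: str, choice: str, apply_chat_template: bool) -> tuple:
    # Raw LM prompts need the leading space so "Answer:" + " yes" tokenizes as
    # separate words; a chat template already delimits the assistant turn.
    if not apply_chat_template:
        return (ctx, " {}".format(choice))
    return (ctx, "{}".format(choice))


print(build_arguments("Answer:", "yes", apply_chat_template=False))  # ('Answer:', ' yes')
print(build_arguments("Answer:", "yes", apply_chat_template=True))   # ('Answer:', 'yes')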
lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common.yaml → lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common
File moved
lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml

 # File generated by `create-yamls.py`
-include: _phrases_es_common.yaml
+include: _phrases_es_common
 task: phrases_es-va
 doc_to_text: 'Oració en espanyol: {{es}}
lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml

 # File generated by `create-yamls.py`
-include: _phrases_es_common.yaml
+include: _phrases_es_common
 task: phrases_va-es
 doc_to_text: 'Oració en valencià: {{va}}
lm_eval/utils.py

@@ -104,7 +104,8 @@ def simple_parse_args_string(args_string):
         return {}
     arg_list = [arg for arg in args_string.split(",") if arg]
     args_dict = {
-        k: handle_arg_string(v) for k, v in [arg.split("=") for arg in arg_list]
+        kv[0]: handle_arg_string("=".join(kv[1:]))
+        for kv in [arg.split("=") for arg in arg_list]
     }
     return args_dict
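The rewritten comprehension splits each argument only on the first `=`, so a value may itself contain `=` without breaking the `(k, v)` unpacking the old line relied on. A standalone illustration (the example argument values are hypothetical, and `handle_arg_string`'s type coercion is omitted for brevity):

# Standalone sketch of the parsing change; handle_arg_string (type coercion) is
# omitted and the example values are hypothetical.
arg_list = ["pretrained=EleutherAI/pythia-160m", "gen_kwargs=temperature=0"]

# Old form: {k: handle_arg_string(v) for k, v in [arg.split("=") ...]} raises
# "ValueError: too many values to unpack" on the second entry.
args_dict = {
    kv[0]: "=".join(kv[1:])
    for kv in [arg.split("=") for arg in arg_list]
}
print(args_dict)  # {'pretrained': 'EleutherAI/pythia-160m', 'gen_kwargs': 'temperature=0'}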
scripts/zeno_visualize.py

@@ -109,13 +109,14 @@ def main():
             if model_index == 0:  # Only need to assemble data for the first model
                 metrics = []
                 for metric in config["metric_list"]:
-                    metrics.append(
-                        ZenoMetric(
-                            name=metric["metric"],
-                            type="mean",
-                            columns=[metric["metric"]],
-                        )
-                    )
+                    if metric.get("aggregation") == "mean":
+                        metrics.append(
+                            ZenoMetric(
+                                name=metric["metric"],
+                                type="mean",
+                                columns=[metric["metric"]],
+                            )
+                        )
                 project = client.create_project(
                     name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                     view="text-classification",

@@ -168,7 +169,11 @@ def generate_dataset(
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     labels = [x["target"] for x in data]
     instance = [""] * len(ids)

@@ -190,6 +195,7 @@ def generate_dataset(
     return pd.DataFrame(
         {
             "id": ids,
+            "doc_id": [x["doc_id"] for x in data],
             "data": instance,
             "input_len": [len(x) for x in instance],
             "labels": labels,

@@ -208,8 +214,15 @@ def generate_system_df(data, config):
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     system_dict = {"id": ids}
+    system_dict["doc_id"] = [x["doc_id"] for x in data]
+    if config.get("filter_list"):
+        system_dict["filter"] = [x["filter"] for x in data]
     system_dict["output"] = [""] * len(ids)

     if config["output_type"] == "loglikelihood":

@@ -228,11 +241,10 @@ def generate_system_df(data, config):
         system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
         system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]

-    metrics = {}
-    for metric in config["metric_list"]:
-        if "aggregation" in metric and metric["aggregation"] == "mean":
-            metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
+    metrics = {
+        metric["metric"]: [x[metric["metric"]] for x in data]
+        for metric in config["metric_list"]
+    }
     system_dict.update(metrics)
     system_df = pd.DataFrame(system_dict)
     return system_df
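A small worked example of the new id construction in generate_dataset and generate_system_df (the data below is made up): when a task defines a filter_list, each document appears once per filter, so the doc id is suffixed with the filter name to keep the Zeno ids unique.

# Made-up data illustrating the id-suffixing logic introduced above.
data = [
    {"doc_id": 0, "filter": "strict-match"},
    {"doc_id": 0, "filter": "flexible-extract"},
    {"doc_id": 1, "filter": "strict-match"},
]
config = {"filter_list": [{"name": "strict-match"}, {"name": "flexible-extract"}]}

ids = (
    [x["doc_id"] for x in data]
    if not config.get("filter_list")
    else [f"{x['doc_id']}.{x['filter']}" for x in data]
)
print(ids)  # ['0.strict-match', '0.flexible-extract', '1.strict-match']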