Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
2ebef470
Commit
2ebef470
authored
Jul 03, 2025
by
Baber
Browse files
Merge branch 'main' into feature/eval_from_config
# Conflicts: # lm_eval/__main__.py
parents
d816f64a
ff41a856
Changes
31
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
293 additions
and
21 deletions
+293
-21
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_es.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_es.yaml
+3
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_eu.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_eu.yaml
+3
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_gl.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_gl.yaml
+3
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_ca.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_ca.yaml
+12
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_en.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_en.yaml
+12
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_es.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_es.yaml
+12
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_eu.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_eu.yaml
+12
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_gl.yaml
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_gl.yaml
+12
-0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc_common
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc_common
+21
-0
lm_eval/tasks/truthfulqa-multi/utils.py
lm_eval/tasks/truthfulqa-multi/utils.py
+199
-0
lm_eval/tasks/unitxt/task.py
lm_eval/tasks/unitxt/task.py
+4
-21
No files found.
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_es.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc_common
task
:
truthfulqa-multi_mc1_es
dataset_name
:
es
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_eu.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc_common
task
:
truthfulqa-multi_mc1_eu
dataset_name
:
eu
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_gl.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc_common
task
:
truthfulqa-multi_mc1_gl
dataset_name
:
gl
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_ca.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc1_ca.yaml
task
:
truthfulqa-multi_mc2_ca
doc_to_choice
:
"
{{mc2_targets.choices}}"
process_results
:
!function
utils.process_results_mc2
should_decontaminate
:
True
doc_to_decontamination_query
:
question
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
2.0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_en.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc1_en.yaml
task
:
truthfulqa-multi_mc2_en
doc_to_choice
:
"
{{mc2_targets.choices}}"
process_results
:
!function
utils.process_results_mc2
should_decontaminate
:
True
doc_to_decontamination_query
:
question
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
2.0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_es.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc1_es.yaml
task
:
truthfulqa-multi_mc2_es
doc_to_choice
:
"
{{mc2_targets.choices}}"
process_results
:
!function
utils.process_results_mc2
should_decontaminate
:
True
doc_to_decontamination_query
:
question
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
2.0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_eu.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc1_eu.yaml
task
:
truthfulqa-multi_mc2_eu
doc_to_choice
:
"
{{mc2_targets.choices}}"
process_results
:
!function
utils.process_results_mc2
should_decontaminate
:
True
doc_to_decontamination_query
:
question
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
2.0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc2_gl.yaml
0 → 100644
View file @
2ebef470
include
:
truthfulqa-multi_mc1_gl.yaml
task
:
truthfulqa-multi_mc2_gl
doc_to_choice
:
"
{{mc2_targets.choices}}"
process_results
:
!function
utils.process_results_mc2
should_decontaminate
:
True
doc_to_decontamination_query
:
question
metric_list
:
-
metric
:
acc
aggregation
:
mean
higher_is_better
:
true
metadata
:
version
:
2.0
lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc_common
0 → 100644
View file @
2ebef470
tag:
- truthfulqa-multi
dataset_path: HiTZ/truthfulqa-multi
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
fewshot_split: train
fewshot_config:
sampler: first_n
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 2.0
doc_to_text: "{{'Q: ' + question + '\nA:'}}"
lm_eval/tasks/truthfulqa-multi/utils.py
0 → 100644
View file @
2ebef470
import
logging
import
datasets
import
numpy
as
np
logger
=
logging
.
getLogger
(
__name__
)
try
:
import
sacrebleu
from
rouge_score
import
rouge_scorer
,
scoring
except
ImportError
as
e
:
raise
type
(
e
)(
"Required packages not installed. Please install the required packages via `pip install rouge_score sacrebleu`"
)
def
process_results_mc2
(
doc
,
results
):
lls
,
is_greedy
=
zip
(
*
results
)
# Split on the first `0` as everything before it is true (`1`).
split_idx
=
list
(
doc
[
"mc2_targets"
][
"labels"
]).
index
(
0
)
# Compute the normalized probability mass for the correct answer.
ll_true
,
ll_false
=
lls
[:
split_idx
],
lls
[
split_idx
:]
p_true
,
p_false
=
np
.
exp
(
np
.
array
(
ll_true
)),
np
.
exp
(
np
.
array
(
ll_false
))
p_true
=
p_true
/
(
sum
(
p_true
)
+
sum
(
p_false
))
accuracy
=
sum
(
p_true
)
# TODO. alternatively
# new = (sum(p_true)/p_true) / ((sum(p_true)/p_true) + (sum(p_false)/p_false))
return
{
"acc"
:
accuracy
}
# , "mc2_new":new}
def
process_docs_gen
(
dataset
:
datasets
.
Dataset
)
->
datasets
.
Dataset
:
return
dataset
.
map
(
preprocess_function
)
def
preprocess_function
(
examples
):
def
_format_answers
(
answers
):
formatted_answers
=
[]
for
answer
in
answers
:
answer
=
answer
.
strip
()
if
len
(
answer
):
# Add a period after all answers.
if
answer
[
-
1
]
!=
"."
:
formatted_answers
.
append
(
answer
+
"."
)
else
:
formatted_answers
.
append
(
answer
)
return
formatted_answers
if
"correct_answers"
in
examples
:
incorrect_answers
=
_format_answers
(
examples
[
"incorrect_answers"
])
correct_answers
=
_format_answers
(
examples
[
"correct_answers"
])
if
examples
[
"lang"
]
==
"en"
and
"I have no comment."
not
in
correct_answers
:
correct_answers
.
append
(
"I have no comment."
)
elif
(
examples
[
"lang"
]
==
"ca"
and
"No tinc cap comentari."
not
in
correct_answers
):
correct_answers
.
append
(
"No tinc cap comentari."
)
elif
(
examples
[
"lang"
]
==
"es"
and
"No tengo ningún comentario."
not
in
correct_answers
):
correct_answers
.
append
(
"No tengo ningún comentario."
)
elif
examples
[
"lang"
]
==
"eu"
and
"Iruzkinik ez."
not
in
correct_answers
:
correct_answers
.
append
(
"Iruzkinik ez."
)
elif
(
examples
[
"lang"
]
==
"gl"
and
"Non teño ningún comentario."
not
in
correct_answers
):
correct_answers
.
append
(
"Non teño ningún comentario."
)
return
{
"question"
:
examples
[
"question"
].
strip
(),
"correct_answers"
:
correct_answers
,
"incorrect_answers"
:
incorrect_answers
,
"best_answer"
:
examples
[
"best_answer"
],
}
def
process_results_gen
(
doc
,
results
):
completion
=
results
[
0
]
true_refs
,
false_refs
=
doc
[
"correct_answers"
],
doc
[
"incorrect_answers"
]
all_refs
=
true_refs
+
false_refs
# Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.
# # BLEURT
# bleurt_scores_true = self.bleurt.compute(
# predictions=[completion] * len(true_refs), references=true_refs
# )["scores"]
# bleurt_scores_false = self.bleurt.compute(
# predictions=[completion] * len(false_refs), references=false_refs
# )["scores"]
# bleurt_correct = max(bleurt_scores_true)
# bleurt_incorrect = max(bleurt_scores_false)
# bleurt_max = bleurt_correct
# bleurt_diff = bleurt_correct - bleurt_incorrect
# bleurt_acc = int(bleurt_correct > bleurt_incorrect)
# BLEU
bleu_scores
=
[
bleu
([[
ref
]],
[
completion
])
for
ref
in
all_refs
]
bleu_correct
=
np
.
nanmax
(
bleu_scores
[:
len
(
true_refs
)])
bleu_incorrect
=
np
.
nanmax
(
bleu_scores
[
len
(
true_refs
)
:])
bleu_max
=
bleu_correct
bleu_diff
=
bleu_correct
-
bleu_incorrect
bleu_acc
=
int
(
bleu_correct
>
bleu_incorrect
)
# ROUGE-N
# rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
# # ROUGE-1
# rouge1_scores = [score["rouge1"] for score in rouge_scores]
# rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
# rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
# rouge1_max = rouge1_correct
# rouge1_diff = rouge1_correct - rouge1_incorrect
# rouge1_acc = int(rouge1_correct > rouge1_incorrect)
# # ROUGE-2
# rouge2_scores = [score["rouge2"] for score in rouge_scores]
# rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
# rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
# rouge2_max = rouge2_correct
# rouge2_diff = rouge2_correct - rouge2_incorrect
# rouge2_acc = int(rouge2_correct > rouge2_incorrect)
# # ROUGE-L
# rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
# rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
# rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
# rougeL_max = rougeL_correct
# rougeL_diff = rougeL_correct - rougeL_incorrect
# rougeL_acc = int(rougeL_correct > rougeL_incorrect)
return
{
# "bleurt_max": bleurt_max,
# "bleurt_acc": bleurt_acc,
# "bleurt_diff": bleurt_diff,
"bleu_max"
:
bleu_max
,
"bleu_acc"
:
bleu_acc
,
"bleu_diff"
:
bleu_diff
,
# "rouge1_max": rouge1_max,
# "rouge1_acc": rouge1_acc,
# "rouge1_diff": rouge1_diff,
# "rouge2_max": rouge2_max,
# "rouge2_acc": rouge2_acc,
# "rouge2_diff": rouge2_diff,
# "rougeL_max": rougeL_max,
# "rougeL_acc": rougeL_acc,
# "rougeL_diff": rougeL_diff,
}
def
bleu
(
refs
,
preds
):
"""
Returns `t5` style BLEU scores. See the related implementation:
https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41
:param refs:
A `list` of `list` of reference `str`s.
:param preds:
A `list` of predicted `str`s.
"""
score
=
sacrebleu
.
corpus_bleu
(
preds
,
refs
,
smooth_method
=
"exp"
,
smooth_value
=
0.0
,
force
=
False
,
lowercase
=
False
,
tokenize
=
"intl"
,
use_effective_order
=
False
,
).
score
return
score
def
rouge
(
refs
,
preds
):
"""
Returns `t5` style ROUGE scores. See the related implementation:
https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
:param refs:
A `list` of reference `strs`.
:param preds:
A `list` of predicted `strs`.
"""
rouge_types
=
[
"rouge1"
,
"rouge2"
,
"rougeLsum"
]
scorer
=
rouge_scorer
.
RougeScorer
(
rouge_types
)
# Add newlines between sentences to correctly compute `rougeLsum`.
def
_prepare_summary
(
summary
):
summary
=
summary
.
replace
(
" . "
,
".
\n
"
)
return
summary
# Accumulate confidence intervals.
aggregator
=
scoring
.
BootstrapAggregator
()
for
ref
,
pred
in
zip
(
refs
,
preds
):
ref
=
_prepare_summary
(
ref
)
pred
=
_prepare_summary
(
pred
)
aggregator
.
add_scores
(
scorer
.
score
(
ref
,
pred
))
result
=
aggregator
.
aggregate
()
return
{
type
:
result
[
type
].
mid
.
fmeasure
*
100
for
type
in
rouge_types
}
lm_eval/tasks/unitxt/task.py
View file @
2ebef470
...
@@ -6,7 +6,6 @@ Addressing this need, we present Unitxt, an innovative library for customizable
...
@@ -6,7 +6,6 @@ Addressing this need, we present Unitxt, an innovative library for customizable
import
importlib.util
import
importlib.util
import
re
import
re
from
collections.abc
import
Callable
from
functools
import
partial
from
functools
import
partial
from
typing
import
Any
,
Dict
,
Optional
from
typing
import
Any
,
Dict
,
Optional
...
@@ -110,18 +109,10 @@ class Unitxt(ConfigurableTask):
...
@@ -110,18 +109,10 @@ class Unitxt(ConfigurableTask):
def
get_arguments
(
self
,
doc
,
ctx
):
def
get_arguments
(
self
,
doc
,
ctx
):
return
(
ctx
,
{
"until"
:
[
"
\n
"
]})
return
(
ctx
,
{
"until"
:
[
"
\n
"
]})
def
fewshot_context
(
def
fewshot_context
(
self
,
doc
,
**
kwargs
)
->
str
:
self
,
doc
:
str
,
num_fewshot
:
int
,
system_instruction
:
Optional
[
str
]
=
None
,
apply_chat_template
:
bool
=
False
,
fewshot_as_multiturn
:
bool
=
False
,
chat_template
:
Optional
[
Callable
]
=
None
,
gen_prefix
:
Optional
[
str
]
=
None
,
)
->
str
:
if
isinstance
(
self
.
doc_to_text
(
doc
),
list
):
if
isinstance
(
self
.
doc_to_text
(
doc
),
list
):
if
apply_chat_template
:
if
kwargs
.
get
(
"apply_chat_template"
):
chat_template
=
kwargs
.
get
(
"chat_template"
)
formated_source
=
chat_template
(
self
.
doc_to_text
(
doc
))
formated_source
=
chat_template
(
self
.
doc_to_text
(
doc
))
return
formated_source
return
formated_source
else
:
else
:
...
@@ -129,15 +120,7 @@ class Unitxt(ConfigurableTask):
...
@@ -129,15 +120,7 @@ class Unitxt(ConfigurableTask):
"Got chat template format from Unitxt, but apply_chat_template is false. Add '--apply_chat_template' to command line."
"Got chat template format from Unitxt, but apply_chat_template is false. Add '--apply_chat_template' to command line."
)
)
else
:
else
:
return
super
().
fewshot_context
(
return
super
().
fewshot_context
(
doc
=
doc
,
**
kwargs
)
doc
=
doc
,
num_fewshot
=
num_fewshot
,
system_instruction
=
system_instruction
,
apply_chat_template
=
apply_chat_template
,
fewshot_as_multiturn
=
fewshot_as_multiturn
,
chat_template
=
chat_template
,
gen_prefix
=
gen_prefix
,
)
def
construct_requests
(
self
,
doc
,
ctx
,
**
kwargs
):
def
construct_requests
(
self
,
doc
,
ctx
,
**
kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
"""Uses RequestFactory to construct Requests and returns an iterable of
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment