gaoqiong / lm-evaluation-harness / Commits

Commit 9bd948df
authored May 22, 2024 by Konrad

Merge branch 'main' into chat_template

parents 8a0ce59d 70e1de09
Changes 84
Showing 4 changed files with 233 additions and 0 deletions (+233 -0)
lm_eval/tasks/tinyBenchmarks/tinyWinogrande.yaml    +18   -0
lm_eval/tasks/tinyBenchmarks/utils_hellaswag.py     +28   -0
lm_eval/tasks/tinyBenchmarks/utils_truthfulqa.py    +170  -0
lm_eval/tasks/tinyBenchmarks/utils_winogrande.py    +17   -0
lm_eval/tasks/tinyBenchmarks/tinyWinogrande.yaml (new file, mode 0 → 100644)
task: tinyWinogrande
dataset_path: tinyBenchmarks/tinyWinogrande
dataset_name: winogrande_xl
output_type: multiple_choice
training_split: train
validation_split: validation
num_fewshot: 5
doc_to_text: !function utils_winogrande.doc_to_text
doc_to_target: !function utils_winogrande.doc_to_target
doc_to_choice: !function utils_winogrande.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: acc_norm
    aggregation: !function agg_functions.agg_gpirt_winogrande
    higher_is_better: true
metadata:
  version: 0.0
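Not part of the commit, but for context: a minimal sketch of how the new task could be run programmatically, assuming the harness's simple_evaluate entry point and a placeholder model checkpoint. The YAML above already fixes the few-shot count to 5.

# Usage sketch only (not in this diff). Assumes lm_eval.simple_evaluate is
# available and that the tinyWinogrande YAML has been registered; the
# pretrained checkpoint below is only a placeholder.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-1.4b",  # placeholder model
    tasks=["tinyWinogrande"],
)
# Aggregated metrics for the task, keyed by task name (expected layout).
print(results["results"]["tinyWinogrande"])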
lm_eval/tasks/tinyBenchmarks/utils_hellaswag.py (new file, mode 0 → 100644)
import re

import datasets


""" This code mirrors the utils of the original hellaswag task """


def preprocess(text):
    text = text.strip()
    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
    text = text.replace(" [title]", ". ")
    text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")
    return text


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
            "query": preprocess(doc["activity_label"] + ": " + ctx),
            "choices": [preprocess(ending) for ending in doc["endings"]],
            "gold": int(doc["label"]),
        }
        return out_doc

    return dataset.map(_process_doc)
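Not part of the diff: a small illustration of what process_docs produces for a HellaSwag-style record. The record below is invented, and the import path is an assumption about how the task folder can be loaded.

# Illustration only (not in this commit); the record is invented and the
# import path is an assumption about how the task folder can be loaded.
import datasets
from lm_eval.tasks.tinyBenchmarks.utils_hellaswag import process_docs

toy = datasets.Dataset.from_list([{
    "activity_label": "Removing ice from car",
    "ctx_a": "Then, the man writes over the snow covering the window of a car,",
    "ctx_b": "then",
    "endings": [
        ", the man adds wax to the windshield and cuts it. [title]",
        ", a person boards a ski lift.",
        ", the man puts on a christmas coat.",
        ", the man continues removing the snow on his car.",
    ],
    "label": "3",
}])

processed = process_docs(toy)
print(processed[0]["query"])    # activity label + context, bracket artifacts stripped
print(processed[0]["choices"])  # the four cleaned endings
print(processed[0]["gold"])     # 3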
lm_eval/tasks/tinyBenchmarks/utils_truthfulqa.py (new file, mode 0 → 100644)
import datasets
import numpy as np
import sacrebleu
from rouge_score import rouge_scorer, scoring


""" This code mirrors the utils of the original truthful_qa task """


def process_results_mc2(doc, results):
    lls, is_greedy = zip(*results)

    # Split on the first `0` as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
    # Compute the normalized probability mass for the correct answer.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))

    return {"acc": sum(p_true)}


def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:
    return dataset.map(preprocess_function)


def preprocess_function(examples):
    def _format_answers(answers):
        formatted_answers = []
        for answer in answers:
            answer = answer.strip()
            if len(answer):
                # Add a period after all answers.
                if answer[-1] != ".":
                    formatted_answers.append(answer + ".")
                else:
                    formatted_answers.append(answer)
        return formatted_answers

    incorrect_answers = _format_answers(examples["incorrect_answers"])
    correct_answers = _format_answers(examples["correct_answers"])
    if "I have no comment." not in correct_answers:
        correct_answers.append("I have no comment.")
    return {
        "question": examples["question"].strip(),
        "correct_answers": correct_answers,
        "incorrect_answers": incorrect_answers,
    }


def process_results_gen(doc, results):
    completion = results[0]
    true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
    all_refs = true_refs + false_refs

    # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.

    # # BLEURT
    # bleurt_scores_true = self.bleurt.compute(
    #     predictions=[completion] * len(true_refs), references=true_refs
    # )["scores"]
    # bleurt_scores_false = self.bleurt.compute(
    #     predictions=[completion] * len(false_refs), references=false_refs
    # )["scores"]
    # bleurt_correct = max(bleurt_scores_true)
    # bleurt_incorrect = max(bleurt_scores_false)
    # bleurt_max = bleurt_correct
    # bleurt_diff = bleurt_correct - bleurt_incorrect
    # bleurt_acc = int(bleurt_correct > bleurt_incorrect)

    # BLEU
    bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]
    bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
    bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
    bleu_max = bleu_correct
    bleu_diff = bleu_correct - bleu_incorrect
    bleu_acc = int(bleu_correct > bleu_incorrect)

    # ROUGE-N
    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
    # ROUGE-1
    rouge1_scores = [score["rouge1"] for score in rouge_scores]
    rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
    rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
    rouge1_max = rouge1_correct
    rouge1_diff = rouge1_correct - rouge1_incorrect
    rouge1_acc = int(rouge1_correct > rouge1_incorrect)
    # ROUGE-2
    rouge2_scores = [score["rouge2"] for score in rouge_scores]
    rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
    rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
    rouge2_max = rouge2_correct
    rouge2_diff = rouge2_correct - rouge2_incorrect
    rouge2_acc = int(rouge2_correct > rouge2_incorrect)
    # ROUGE-L
    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
    rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
    rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
    rougeL_max = rougeL_correct
    rougeL_diff = rougeL_correct - rougeL_incorrect
    rougeL_acc = int(rougeL_correct > rougeL_incorrect)

    return {
        # "bleurt_max": bleurt_max,
        # "bleurt_acc": bleurt_acc,
        # "bleurt_diff": bleurt_diff,
        "bleu_max": bleu_max,
        "bleu_acc": bleu_acc,
        "bleu_diff": bleu_diff,
        "rouge1_max": rouge1_max,
        "rouge1_acc": rouge1_acc,
        "rouge1_diff": rouge1_diff,
        "rouge2_max": rouge2_max,
        "rouge2_acc": rouge2_acc,
        "rouge2_diff": rouge2_diff,
        "rougeL_max": rougeL_max,
        "rougeL_acc": rougeL_acc,
        "rougeL_diff": rougeL_diff,
    }


def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    ).score
    return score


def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences to correctly compute `rougeLsum`.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
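Not part of the diff: a toy run of process_results_mc2 to show the normalization it performs over true and false answer log-likelihoods. The numbers are invented and the import path is an assumption.

# Illustration only (not in this commit); log-likelihoods are invented and the
# import path is an assumption about how the task folder can be loaded.
from lm_eval.tasks.tinyBenchmarks.utils_truthfulqa import process_results_mc2

# Labels list the true answers (1) first, then the false ones (0).
doc = {"mc2_targets": {"labels": [1, 1, 0, 0]}}
# One (log-likelihood, is_greedy) pair per candidate answer, in the same order.
results = [(-1.0, False), (-2.0, False), (-1.5, False), (-3.0, False)]

out = process_results_mc2(doc, results)
# acc = (e^-1.0 + e^-2.0) / (e^-1.0 + e^-2.0 + e^-1.5 + e^-3.0) ≈ 0.65
print(out["acc"])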
lm_eval/tasks/tinyBenchmarks/utils_winogrande.py (new file, mode 0 → 100644)
""" This code mirrors the utils of the original winogrande task """
def
doc_to_text
(
doc
):
answer_to_num
=
{
"1"
:
0
,
"2"
:
1
}
return
answer_to_num
[
doc
[
"answer"
]]
def
doc_to_target
(
doc
):
idx
=
doc
[
"sentence"
].
index
(
"_"
)
+
1
return
doc
[
"sentence"
][
idx
:].
strip
()
def
doc_to_choice
(
doc
):
idx
=
doc
[
"sentence"
].
index
(
"_"
)
options
=
[
doc
[
"option1"
],
doc
[
"option2"
]]
return
[
doc
[
"sentence"
][:
idx
]
+
opt
for
opt
in
options
]
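Not part of the diff: the three helpers above applied to an invented Winogrande-style record, showing how the fill-in-the-blank sentence is split into per-option prompts and a shared continuation.

# Illustration only (not in this commit); the record is invented and the
# import path is an assumption about how the task folder can be loaded.
from lm_eval.tasks.tinyBenchmarks.utils_winogrande import (
    doc_to_choice,
    doc_to_target,
    doc_to_text,
)

doc = {
    "sentence": "The trophy does not fit in the suitcase because _ is too big.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "1",
}

print(doc_to_text(doc))    # 0 -> index of the correct option
print(doc_to_target(doc))  # "is too big."
print(doc_to_choice(doc))
# ["The trophy does not fit in the suitcase because the trophy",
#  "The trophy does not fit in the suitcase because the suitcase"]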