gaoqiong / lm-evaluation-harness

Commit a8601618, authored Dec 18, 2024 by Baber

add hotpotqa_e

Parent: 8558b8d4

Showing 3 changed files with 296 additions and 0 deletions (+296, -0)
lm_eval/tasks/longbench/longbench.yaml  +20  -0
lm_eval/tasks/longbench/metrics.py      +153 -0
lm_eval/tasks/longbench/utils.py        +123 -0
lm_eval/tasks/longbench/longbench.yaml  0 → 100644
task: longbench
dataset_path: THUDM/LongBench
dataset_name: hotpotqa_e
output_type: generate_until
test_split: test
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: "{{answers}}"
generation_kwargs:
  max_gen_toks: 32
  temperature: 1
  do_sample: false
metric_list:
  - metric: !function metrics.qa_f1_score
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
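The `!function metrics.qa_f1_score` hook points at the scoring function defined in metrics.py below. As a quick smoke test of the config, something like the following sketch should work; it is not part of the commit and assumes the harness's standard simple_evaluate entry point, that the YAML is discovered under its registered name "longbench", and a placeholder HF model.

# Minimal smoke test for the new task config (sketch; not part of this commit).
# Assumes lm-evaluation-harness is installed along with the extra dependencies
# the metrics below pull in (jieba, fuzzywuzzy, rouge).
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model choice
    tasks=["longbench"],  # task name as declared in the YAML above
    limit=8,              # score only a handful of hotpotqa_e documents
)
print(results["results"]["longbench"])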
lm_eval/tasks/longbench/metrics.py  0 → 100644
import re
import string
from collections import Counter

import jieba
from fuzzywuzzy import fuzz
from rouge import Rouge


def normalize_answer(s: str):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text: str):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def normalize_zh_answer(s: str):
    """Lower text and remove punctuation, extra whitespace."""

    def white_space_fix(text):
        return "".join(text.split())

    def remove_punc(text):
        cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))

def count_score(prediction, ground_truth, **kwargs):
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def retrieval_score(prediction, ground_truth, **kwargs):
    pattern = r"Paragraph (\d+)"
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def retrieval_zh_score(prediction, ground_truth, **kwargs):
    pattern = r"段落(\d+)"
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def code_sim_score(prediction, ground_truth, **kwargs):
    all_lines = prediction.lstrip("\n").split("\n")
    prediction = ""
    for line in all_lines:
        if ("`" not in line) and ("#" not in line) and ("//" not in line):
            prediction = line
            break
    return fuzz.ratio(prediction, ground_truth) / 100


def classification_score(prediction, ground_truth, **kwargs):
    em_match_list = []
    all_classes = kwargs["all_classes"]
    for class_name in all_classes:
        if class_name in prediction:
            em_match_list.append(class_name)
    for match_term in em_match_list:
        if match_term in ground_truth and match_term != ground_truth:
            em_match_list.remove(match_term)
    if ground_truth in em_match_list:
        score = 1.0 / len(em_match_list)
    else:
        score = 0.0
    return score


def rouge_score(prediction, ground_truth, **kwargs):
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    # ruff: noqa
    except:
        return 0.0
    return scores["rouge-l"]["f"]


def rouge_zh_score(prediction, ground_truth, **kwargs):
    prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
    ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
    score = rouge_score(prediction, ground_truth)
    return score


def f1_score(prediction, ground_truth, **kwargs):
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def qa_f1_score(*args):
    gold_answer, result = args
    normalized_prediction = normalize_answer(result)
    normalized_ground_truth = normalize_answer(gold_answer)
    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    return f1_score(prediction_tokens, ground_truth_tokens)


def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    prediction_tokens = list(jieba.cut(prediction, cut_all=False))
    ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
    prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
    ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
    prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
    ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
    return f1_score(prediction_tokens, ground_truth_tokens)
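Note that qa_f1_score unpacks its positional arguments as (gold_answer, result), i.e. the reference comes first and the model output second; both are normalized and scored with token-level F1. A small illustrative check, not part of the commit; it assumes you run it from this task directory so that metrics imports directly, mirroring the from-metrics import style used in utils.py below.

# Illustrative check of the scoring path (sketch; not part of this commit).
# Run from lm_eval/tasks/longbench/ so that `metrics` is importable directly.
from metrics import normalize_answer, qa_f1_score

print(normalize_answer("The Golden Gate Bridge!"))  # -> "golden gate bridge"

# gold answer first, model output second, matching the *args unpacking above
print(qa_f1_score("Saint Bernadette Soubirous",
                  "It was Saint Bernadette Soubirous."))  # -> 0.75 (3 shared tokens, 5 predicted, 3 gold)
print(qa_f1_score("Denver Broncos", "the Carolina Panthers"))  # -> 0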
lm_eval/tasks/longbench/utils.py  0 → 100644
import argparse
import json
import os

import numpy as np
from metrics import (
    classification_score,
    code_sim_score,
    count_score,
    qa_f1_score,
    qa_f1_zh_score,
    retrieval_score,
    retrieval_zh_score,
    rouge_score,
    rouge_zh_score,
)

dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    "multifieldqa_zh": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}


# def parse_args(args=None):
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--model', type=str, default=None)
#     parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
#     return parser.parse_args(args)


def scorer_e(dataset, predictions, answers, lengths, all_classes):
    scores = {"0-4k": [], "4-8k": [], "8k+": []}
    for prediction, ground_truths, length in zip(predictions, answers, lengths):
        score = 0.0
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip("\n").split("\n")[0]
        for ground_truth in ground_truths:
            score = max(
                score,
                dataset2metric[dataset](
                    prediction, ground_truth, all_classes=all_classes
                ),
            )
        if length < 4000:
            scores["0-4k"].append(score)
        elif length < 8000:
            scores["4-8k"].append(score)
        else:
            scores["8k+"].append(score)
    for key in scores.keys():
        scores[key] = round(100 * np.mean(scores[key]), 2)
    return scores


def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.0
    for prediction, ground_truths in zip(predictions, answers):
        score = 0.0
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip("\n").split("\n")[0]
        for ground_truth in ground_truths:
            score = max(
                score,
                dataset2metric[dataset](
                    prediction, ground_truth, all_classes=all_classes
                ),
            )
        total_score += score
    return round(100 * total_score / len(predictions), 2)


# if __name__ == '__main__':
#     args = parse_args()
#     scores = dict()
#     if args.e:
#         path = f"pred_e/{args.model}/"
#     else:
#         path = f"pred/{args.model}/"
#     all_files = os.listdir(path)
#     print("Evaluating on:", all_files)
#     for filename in all_files:
#         if not filename.endswith("jsonl"):
#             continue
#         predictions, answers, lengths = [], [], []
#         dataset = filename.split('.')[0]
#         with open(f"{path}{filename}", "r", encoding="utf-8") as f:
#             for line in f:
#                 data = json.loads(line)
#                 predictions.append(data["pred"])
#                 answers.append(data["answers"])
#                 all_classes = data["all_classes"]
#                 if "length" in data:
#                     lengths.append(data["length"])
#         if args.e:
#             score = scorer_e(dataset, predictions, answers, lengths, all_classes)
#         else:
#             score = scorer(dataset, predictions, answers, all_classes)
#         scores[dataset] = score
#     if args.e:
#         out_path = f"pred_e/{args.model}/result.json"
#     else:
#         out_path = f"pred/{args.model}/result.json"
#     with open(out_path, "w") as f:
#         json.dump(scores, f, ensure_ascii=False, indent=4)
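The scorer and scorer_e helpers (and the commented-out driver above) appear to mirror the upstream LongBench evaluation script rather than being wired into the YAML task; scorer_e additionally buckets per-example scores by context length. A toy run with made-up inputs, not part of the commit; it assumes execution from this directory so that utils and metrics import directly, and picks a dataset whose metric accepts the all_classes keyword.

# Toy illustration of scorer_e's length bucketing (sketch; not part of this commit).
# Run from lm_eval/tasks/longbench/ so that `utils` (and its `metrics` import) resolve.
from utils import scorer_e

predictions = ["There are 5 passages.", "7 paragraphs repeat", "2"]
answers = [["5"], ["6"], ["2"]]   # one list of reference answers per prediction
lengths = [3500, 6200, 9100]      # context lengths as reported by LongBench-E

# "passage_count" routes to count_score via dataset2metric
print(scorer_e("passage_count", predictions, answers, lengths, all_classes=None))
# -> {'0-4k': 100.0, '4-8k': 0.0, '8k+': 100.0}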