Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
opencompass
Commits
3bb3d330
Unverified
Commit
3bb3d330
authored
Sep 27, 2023
by
philipwangOvO
Committed by
GitHub
Sep 27, 2023
Browse files
[Sync] Update LongEval (#443)
parent
2bb7beec
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
75 additions
and
5 deletions
+75
-5
opencompass/datasets/leval/evaluators.py
opencompass/datasets/leval/evaluators.py
+30
-0
opencompass/datasets/longbench/__init__.py
opencompass/datasets/longbench/__init__.py
+2
-1
opencompass/datasets/longbench/evaluators.py
opencompass/datasets/longbench/evaluators.py
+2
-2
opencompass/datasets/longbench/longbench_multi_news.py
opencompass/datasets/longbench/longbench_multi_news.py
+21
-0
opencompass/datasets/longbench/longbench_samsum.py
opencompass/datasets/longbench/longbench_samsum.py
+1
-1
opencompass/models/huggingface.py
opencompass/models/huggingface.py
+19
-1
No files found.
opencompass/datasets/leval/evaluators.py
View file @
3bb3d330
...
@@ -4,6 +4,7 @@ from typing import List
...
@@ -4,6 +4,7 @@ from typing import List
from
opencompass.openicl.icl_evaluator
import
BaseEvaluator
from
opencompass.openicl.icl_evaluator
import
BaseEvaluator
from
opencompass.registry
import
ICL_EVALUATORS
from
opencompass.registry
import
ICL_EVALUATORS
from
opencompass.utils.prompt
import
PromptList
from
opencompass.utils.prompt
import
PromptList
from
opencompass.utils.text_postprocessors
import
general_postprocess
@
ICL_EVALUATORS
.
register_module
()
@
ICL_EVALUATORS
.
register_module
()
...
@@ -107,3 +108,32 @@ class LEvalGPTEvaluator(BaseEvaluator):
...
@@ -107,3 +108,32 @@ class LEvalGPTEvaluator(BaseEvaluator):
score
=
score
/
(
num_samples
-
bad_case
)
*
100
score
=
score
/
(
num_samples
-
bad_case
)
*
100
return
{
'score'
:
score
}
return
{
'score'
:
score
}
@ICL_EVALUATORS.register_module()
class LEvalEMEvaluator(BaseEvaluator):
    """Exact match evaluator.

    A sample counts as correct when its reference answer occurs as a
    substring of the post-processed prediction. The reference is tried both
    after ``general_postprocess`` and in its original form.
    """

    def __init__(self) -> None:
        super().__init__()

    def score(self, predictions, references):
        """Compute the percentage of predictions containing their reference.

        Args:
            predictions: Model outputs, one per sample.
            references: Gold answers, aligned with ``predictions``.

        Returns:
            dict: ``{'score': <percentage in [0, 100]>}`` on success, or
            ``{'error': <message>}`` when the inputs have different lengths
            or are empty.
        """
        if len(predictions) != len(references):
            return {
                'error': 'predictions and references have different '
                'length'
            }
        # Guard the division below: without samples there is no score.
        if len(predictions) == 0:
            return {'error': 'predictions and references are empty'}

        processed_predictions = [
            general_postprocess(prediction) for prediction in predictions
        ]
        processed_answers = [
            general_postprocess(reference) for reference in references
        ]

        # The raw reference is also tried because post-processing may alter
        # the answer text; both are matched against the processed prediction.
        cnt = sum(
            1 for pred, ans, origin_ans in zip(
                processed_predictions, processed_answers, references)
            if ans in pred or origin_ans in pred)

        score = cnt / len(predictions) * 100
        return {'score': score}
opencompass/datasets/longbench/__init__.py
View file @
3bb3d330
...
@@ -10,17 +10,18 @@ from .longbench_gov_report import * # noqa: F401, F403
...
@@ -10,17 +10,18 @@ from .longbench_gov_report import * # noqa: F401, F403
from
.longbench_hotpot_qa
import
*
# noqa: F401, F403
from
.longbench_hotpot_qa
import
*
# noqa: F401, F403
from
.longbench_lcc
import
*
# noqa: F401, F403
from
.longbench_lcc
import
*
# noqa: F401, F403
from
.longbench_lsht
import
*
# noqa: F401, F403
from
.longbench_lsht
import
*
# noqa: F401, F403
from
.longbench_multi_news
import
*
# noqa: F401, F403
from
.longbench_multifieldqa_en
import
*
# noqa: F401, F403
from
.longbench_multifieldqa_en
import
*
# noqa: F401, F403
from
.longbench_multifieldqa_zh
import
*
# noqa: F401, F403
from
.longbench_multifieldqa_zh
import
*
# noqa: F401, F403
from
.longbench_musique
import
*
# noqa: F401, F403
from
.longbench_musique
import
*
# noqa: F401, F403
from
.longbench_narrative_qa
import
*
# noqa: F401, F403
from
.longbench_narrative_qa
import
*
# noqa: F401, F403
from
.longbench_nq
import
*
# noqa: F401, F403
from
.longbench_passage_count
import
*
# noqa: F401, F403
from
.longbench_passage_count
import
*
# noqa: F401, F403
from
.longbench_passage_retrieval_en
import
*
# noqa: F401, F403
from
.longbench_passage_retrieval_en
import
*
# noqa: F401, F403
from
.longbench_passage_retrieval_zh
import
*
# noqa: F401, F403
from
.longbench_passage_retrieval_zh
import
*
# noqa: F401, F403
from
.longbench_qasper
import
*
# noqa: F401, F403
from
.longbench_qasper
import
*
# noqa: F401, F403
from
.longbench_qmsum
import
*
# noqa: F401, F403
from
.longbench_qmsum
import
*
# noqa: F401, F403
from
.longbench_repobench
import
*
# noqa: F401, F403
from
.longbench_repobench
import
*
# noqa: F401, F403
from
.longbench_samsum
import
*
# noqa: F401, F403
from
.longbench_trec
import
*
# noqa: F401, F403
from
.longbench_trec
import
*
# noqa: F401, F403
from
.longbench_trivia_qa
import
*
# noqa: F401, F403
from
.longbench_trivia_qa
import
*
# noqa: F401, F403
from
.longbench_vcsum
import
*
# noqa: F401, F403
from
.longbench_vcsum
import
*
# noqa: F401, F403
opencompass/datasets/longbench/evaluators.py
View file @
3bb3d330
...
@@ -189,10 +189,10 @@ class LongBenchRougeEvaluator(BaseEvaluator):
...
@@ -189,10 +189,10 @@ class LongBenchRougeEvaluator(BaseEvaluator):
list
(
jieba
.
cut
(
reference
,
cut_all
=
False
)))
list
(
jieba
.
cut
(
reference
,
cut_all
=
False
)))
rouge
=
Rouge
()
rouge
=
Rouge
()
if
prediction
!=
''
:
try
:
cur_score
=
rouge
.
get_scores
([
prediction
],
[
reference
],
cur_score
=
rouge
.
get_scores
([
prediction
],
[
reference
],
avg
=
True
)[
'rouge-l'
][
'f'
]
avg
=
True
)[
'rouge-l'
][
'f'
]
e
lse
:
e
xcept
Exception
:
cur_score
=
0.
cur_score
=
0.
task_score
=
max
(
task_score
,
cur_score
)
task_score
=
max
(
task_score
,
cur_score
)
...
...
opencompass/datasets/longbench/longbench_multi_news.py
0 → 100644
View file @
3bb3d330
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmulti_newsDataset(BaseDataset):
    """Dataset loader that keeps only the ``context`` and ``answers`` fields
    of the ``test`` split."""

    @staticmethod
    def load(**kwargs):
        """Load the dataset and rebuild its ``test`` split.

        Args:
            **kwargs: Forwarded unchanged to ``datasets.load_dataset``.

        Returns:
            The loaded dataset object, with its ``'test'`` split replaced by
            a ``Dataset`` whose rows hold only ``context`` and ``answers``.
        """
        dataset = load_dataset(**kwargs)
        split = 'test'
        test_split = dataset[split]
        # Read each column once; indexing the column inside a per-row loop
        # would fetch the whole column again on every iteration.
        contexts = test_split['context']
        answers_column = test_split['answers']
        raw_data = [{
            'context': context,
            'answers': answers
        } for context, answers in zip(contexts, answers_column)]
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/longbench/longbench_
nq
.py
→
opencompass/datasets/longbench/longbench_
samsum
.py
View file @
3bb3d330
...
@@ -6,7 +6,7 @@ from ..base import BaseDataset
...
@@ -6,7 +6,7 @@ from ..base import BaseDataset
@
LOAD_DATASET
.
register_module
()
@
LOAD_DATASET
.
register_module
()
class
LongBench
nq
Dataset
(
BaseDataset
):
class
LongBench
samsum
Dataset
(
BaseDataset
):
@
staticmethod
@
staticmethod
def
load
(
**
kwargs
):
def
load
(
**
kwargs
):
...
...
opencompass/models/huggingface.py
View file @
3bb3d330
...
@@ -42,6 +42,9 @@ class HuggingFace(BaseModel):
...
@@ -42,6 +42,9 @@ class HuggingFace(BaseModel):
without batch padding.
without batch padding.
pad_token_id (int): The id of the padding token. Defaults to None. Use
pad_token_id (int): The id of the padding token. Defaults to None. Use
(#vocab + pad_token_id) if get negative value.
(#vocab + pad_token_id) if get negative value.
mode (str, optional): The method of input truncation when the input
length exceeds max_seq_len. 'mid' drops the middle of the input and
keeps its head and tail. Defaults to 'none'.
Note:
Note:
About ``extract_pred_after_decode``: Commonly, we should extract the
About ``extract_pred_after_decode``: Commonly, we should extract the
...
@@ -62,7 +65,8 @@ class HuggingFace(BaseModel):
...
@@ -62,7 +65,8 @@ class HuggingFace(BaseModel):
meta_template
:
Optional
[
Dict
]
=
None
,
meta_template
:
Optional
[
Dict
]
=
None
,
extract_pred_after_decode
:
bool
=
False
,
extract_pred_after_decode
:
bool
=
False
,
batch_padding
:
bool
=
False
,
batch_padding
:
bool
=
False
,
pad_token_id
:
Optional
[
int
]
=
None
):
pad_token_id
:
Optional
[
int
]
=
None
,
mode
:
str
=
'none'
):
super
().
__init__
(
path
=
path
,
super
().
__init__
(
path
=
path
,
max_seq_len
=
max_seq_len
,
max_seq_len
=
max_seq_len
,
tokenizer_only
=
tokenizer_only
,
tokenizer_only
=
tokenizer_only
,
...
@@ -73,6 +77,8 @@ class HuggingFace(BaseModel):
...
@@ -73,6 +77,8 @@ class HuggingFace(BaseModel):
patch_hf_auto_model
(
hf_cache_dir
)
patch_hf_auto_model
(
hf_cache_dir
)
self
.
logger
=
get_logger
()
self
.
logger
=
get_logger
()
self
.
pad_token_id
=
pad_token_id
self
.
pad_token_id
=
pad_token_id
assert
mode
in
[
'none'
,
'mid'
]
self
.
mode
=
mode
self
.
_load_tokenizer
(
path
=
path
,
self
.
_load_tokenizer
(
path
=
path
,
tokenizer_path
=
tokenizer_path
,
tokenizer_path
=
tokenizer_path
,
tokenizer_kwargs
=
tokenizer_kwargs
)
tokenizer_kwargs
=
tokenizer_kwargs
)
...
@@ -228,6 +234,18 @@ class HuggingFace(BaseModel):
...
@@ -228,6 +234,18 @@ class HuggingFace(BaseModel):
if
self
.
extract_pred_after_decode
:
if
self
.
extract_pred_after_decode
:
prompt_lens
=
[
len
(
input_
)
for
input_
in
inputs
]
prompt_lens
=
[
len
(
input_
)
for
input_
in
inputs
]
if
self
.
mode
==
'mid'
:
input_ids
=
self
.
tokenizer
(
inputs
,
truncation
=
False
)[
'input_ids'
]
input_ids
=
torch
.
tensor
(
input_ids
,
device
=
self
.
model
.
device
)
if
len
(
input_ids
[
0
])
>
self
.
max_seq_len
-
max_out_len
:
half
=
int
((
self
.
max_seq_len
-
max_out_len
)
/
2
)
inputs
=
[
self
.
tokenizer
.
decode
(
input_ids
[
0
][:
half
],
skip_special_tokens
=
True
)
+
self
.
tokenizer
.
decode
(
input_ids
[
0
][
-
half
:],
skip_special_tokens
=
True
)
]
input_ids
=
self
.
tokenizer
(
inputs
,
input_ids
=
self
.
tokenizer
(
inputs
,
truncation
=
True
,
truncation
=
True
,
max_length
=
self
.
max_seq_len
-
max_length
=
self
.
max_seq_len
-
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment