Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
opencompass
Commits
655a807f
Unverified
Commit
655a807f
authored
Aug 21, 2023
by
philipwangOvO
Committed by
GitHub
Aug 21, 2023
Browse files
[Dataset] LongBench (#236)
Co-authored-by:
wangchonghua
<
wangchonghua@pjlab.org.cn
>
parent
c6a34949
Changes
66
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
731 additions
and
0 deletions
+731
-0
configs/summarizers/longbench.py
configs/summarizers/longbench.py
+35
-0
opencompass/datasets/__init__.py
opencompass/datasets/__init__.py
+1
-0
opencompass/datasets/longbench/__init__.py
opencompass/datasets/longbench/__init__.py
+26
-0
opencompass/datasets/longbench/evaluators.py
opencompass/datasets/longbench/evaluators.py
+264
-0
opencompass/datasets/longbench/longbench_2wikim_qa.py
opencompass/datasets/longbench/longbench_2wikim_qa.py
+26
-0
opencompass/datasets/longbench/longbench_dureader.py
opencompass/datasets/longbench/longbench_dureader.py
+26
-0
opencompass/datasets/longbench/longbench_gov_report.py
opencompass/datasets/longbench/longbench_gov_report.py
+21
-0
opencompass/datasets/longbench/longbench_hotpot_qa.py
opencompass/datasets/longbench/longbench_hotpot_qa.py
+26
-0
opencompass/datasets/longbench/longbench_lcc.py
opencompass/datasets/longbench/longbench_lcc.py
+21
-0
opencompass/datasets/longbench/longbench_lsht.py
opencompass/datasets/longbench/longbench_lsht.py
+30
-0
opencompass/datasets/longbench/longbench_multifieldqa_en.py
opencompass/datasets/longbench/longbench_multifieldqa_en.py
+26
-0
opencompass/datasets/longbench/longbench_multifieldqa_zh.py
opencompass/datasets/longbench/longbench_multifieldqa_zh.py
+26
-0
opencompass/datasets/longbench/longbench_musique.py
opencompass/datasets/longbench/longbench_musique.py
+26
-0
opencompass/datasets/longbench/longbench_narrative_qa.py
opencompass/datasets/longbench/longbench_narrative_qa.py
+26
-0
opencompass/datasets/longbench/longbench_nq.py
opencompass/datasets/longbench/longbench_nq.py
+26
-0
opencompass/datasets/longbench/longbench_passage_count.py
opencompass/datasets/longbench/longbench_passage_count.py
+21
-0
opencompass/datasets/longbench/longbench_passage_retrieval_en.py
...pass/datasets/longbench/longbench_passage_retrieval_en.py
+26
-0
opencompass/datasets/longbench/longbench_passage_retrieval_zh.py
...pass/datasets/longbench/longbench_passage_retrieval_zh.py
+26
-0
opencompass/datasets/longbench/longbench_qasper.py
opencompass/datasets/longbench/longbench_qasper.py
+26
-0
opencompass/datasets/longbench/longbench_qmsum.py
opencompass/datasets/longbench/longbench_qmsum.py
+26
-0
No files found.
configs/summarizers/longbench.py
0 → 100644
View file @
655a807f
# Summarizer config for the LongBench benchmark: controls the order and
# grouping of result rows in the final report table.
summarizer = dict(
    # Ordered list of dataset abbreviations to report. The quoted
    # '---------' entries are not datasets: the summarizer prints them
    # verbatim as visual section headers between groups of results.
    dataset_abbrs=[
        '--------- LongBench Single-Document QA ---------',  # category
        "LongBench_narrativeqa",
        'LongBench_qasper',
        'LongBench_multifieldqa_en',
        "LongBench_multifieldqa_zh",
        '--------- LongBench Multi-Document QA ---------',  # category
        'LongBench_hotpotqa',
        'LongBench_2wikimqa',
        'LongBench_musique',
        'LongBench_dureader',
        '--------- LongBench Summarization ---------',  # category
        'LongBench_gov_report',
        'LongBench_qmsum',
        'LongBench_vcsum',
        '--------- LongBench Few-shot Learning ---------',  # category
        'LongBench_trec',
        'LongBench_nq',
        'LongBench_triviaqa',
        'LongBench_lsht',
        '--------- LongBench Code Completion ---------',  # category
        'LongBench_lcc',
        'LongBench_repobench-p',
        '--------- LongBench Synthetic Tasks ---------',  # category
        'LongBench_passage_retrieval_en',
        'LongBench_passage_count',
        'LongBench_passage_retrieval_zh',
    ],
    # Gather every *_summary_groups variable defined earlier in this config
    # module and flatten them into one list. NOTE: locals() here is evaluated
    # in the module namespace (the comprehension's first iterable is evaluated
    # in the enclosing scope), so this picks up module-level variables.
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")],
        []),
    # Prompt-database bookkeeping used by the summarizer tooling.
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
        blacklist='.promptignore'),
)
opencompass/datasets/__init__.py
View file @
655a807f
...
...
@@ -54,6 +54,7 @@ from .LEval_scientific_qa import * # noqa: F401, F403
from
.LEval_topic_retrieval
import
*
# noqa: F401, F403
from
.LEval_tpo
import
*
# noqa: F401, F403
from
.LEval_tvshow_summ
import
*
# noqa: F401, F403
from
.longbench
import
*
# noqa: F401, F403
from
.math
import
*
# noqa: F401, F403
from
.mbpp
import
*
# noqa: F401, F403
from
.mmlu
import
*
# noqa: F401, F403
...
...
opencompass/datasets/longbench/__init__.py
0 → 100644
View file @
655a807f
from
.evaluators
import
LongBenchClassificationEvaluator
# noqa: F401, F403
from
.evaluators
import
LongBenchCodeSimEvaluator
# noqa: F401, F403
from
.evaluators
import
LongBenchCountEvaluator
# noqa: F401, F403
from
.evaluators
import
LongBenchF1Evaluator
# noqa: F401, F403
from
.evaluators
import
LongBenchRetrievalEvaluator
# noqa: F401, F403
from
.evaluators
import
LongBenchRougeEvaluator
# noqa: F401, F403
from
.longbench_2wikim_qa
import
*
# noqa: F401, F403
from
.longbench_dureader
import
*
# noqa: F401, F403
from
.longbench_gov_report
import
*
# noqa: F401, F403
from
.longbench_hotpot_qa
import
*
# noqa: F401, F403
from
.longbench_lcc
import
*
# noqa: F401, F403
from
.longbench_lsht
import
*
# noqa: F401, F403
from
.longbench_multifieldqa_en
import
*
# noqa: F401, F403
from
.longbench_multifieldqa_zh
import
*
# noqa: F401, F403
from
.longbench_musique
import
*
# noqa: F401, F403
from
.longbench_narrative_qa
import
*
# noqa: F401, F403
from
.longbench_nq
import
*
# noqa: F401, F403
from
.longbench_passage_count
import
*
# noqa: F401, F403
from
.longbench_passage_retrieval_en
import
*
# noqa: F401, F403
from
.longbench_passage_retrieval_zh
import
*
# noqa: F401, F403
from
.longbench_qasper
import
*
# noqa: F401, F403
from
.longbench_qmsum
import
*
# noqa: F401, F403
from
.longbench_repobench
import
*
# noqa: F401, F403
from
.longbench_trec
import
*
# noqa: F401, F403
from
.longbench_trivia_qa
import
*
# noqa: F401, F403
from
.longbench_vcsum
import
*
# noqa: F401, F403
opencompass/datasets/longbench/evaluators.py
0 → 100644
View file @
655a807f
import
difflib
import
re
import
string
from
collections
import
Counter
from
typing
import
List
import
jieba
from
fuzzywuzzy
import
fuzz
from
rouge
import
Rouge
from
opencompass.openicl.icl_evaluator
import
BaseEvaluator
from
opencompass.registry
import
ICL_EVALUATORS
def normalize_answer(s):
    """Normalize an English answer string for token-level comparison.

    Applies, in order: lowercasing, punctuation removal, removal of the
    articles a/an/the, and collapsing of all whitespace runs into single
    spaces.
    """
    lowered = s.lower()
    # Strip every ASCII punctuation character in one translation pass.
    no_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Replace whole-word articles with a space (word boundaries keep
    # e.g. 'theory' intact).
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punct)
    return ' '.join(no_articles.split())
def normalize_zh_answer(s):
    """Lower text and remove punctuation, extra whitespace."""

    # Remove ALL whitespace rather than collapsing it: Chinese text has no
    # meaningful word-separating spaces (unlike the English normalizer).
    def white_space_fix(text):
        return ''.join(text.split())

    # Strip both ASCII punctuation and full-width/CJK punctuation marks.
    def remove_punc(text):
        cn_punctuation = '!?。。"#$%&'()*+,-/:;<=>@[\]^_`\
            {|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.'
        all_punctuation = set(string.punctuation + cn_punctuation)
        return ''.join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    # Pipeline: lowercase -> strip punctuation -> strip whitespace.
    return white_space_fix(remove_punc(lower(s)))
@ICL_EVALUATORS.register_module()
class LongBenchF1Evaluator(BaseEvaluator):
    """Token-level F1 for LongBench QA tasks.

    English text is normalized with ``normalize_answer`` and split on
    whitespace; Chinese text is segmented with jieba and each token is
    normalized with ``normalize_zh_answer`` (empty tokens dropped). For each
    prediction the best F1 over all gold references is kept, and the mean
    over samples is returned on a 0-100 scale.
    """

    def __init__(self, language: str = 'en') -> None:
        super().__init__()
        assert language in ['en', 'zh']
        self.language = language

    def score(self, predictions: List, references: List) -> dict:

        def tokenize(text):
            # Language-dependent tokenization.
            if self.language == 'en':
                return normalize_answer(text).split()
            tokens = [normalize_zh_answer(tok)
                      for tok in jieba.cut(text, cut_all=False)]
            return [tok for tok in tokens if tok]

        def f1(pred_tokens, ref_tokens):
            # Multiset (bag-of-tokens) overlap.
            overlap = sum(
                (Counter(pred_tokens) & Counter(ref_tokens)).values())
            if overlap == 0:
                return 0
            precision = 1.0 * overlap / len(pred_tokens)
            recall = 1.0 * overlap / len(ref_tokens)
            return (2 * precision * recall) / (precision + recall)

        total = 0.
        for prediction, reference_list in zip(predictions, references):
            # The prediction's tokens do not depend on the reference,
            # so tokenize once per sample.
            pred_tokens = tokenize(prediction)
            best = 0.
            for reference in reference_list:
                best = max(best, f1(pred_tokens, tokenize(reference)))
            total += best
        return {'score': total / len(predictions) * 100}
@ICL_EVALUATORS.register_module()
class LongBenchCountEvaluator(BaseEvaluator):
    """Scorer for the LongBench passage-count task.

    Every integer mentioned in the prediction is compared against the gold
    count; the per-sample score is the fraction of mentioned numbers equal
    to the reference (0 when no number appears at all). The mean over
    samples is returned on a 0-100 scale.
    """

    def score(self, predictions: List, references: List) -> dict:
        """Compute the mean per-sample score.

        Args:
            predictions: model outputs, one string per sample.
            references: per-sample list of gold answers.

        Returns:
            dict with a single 'score' entry in [0, 100].
        """
        score = 0.
        for prediction, reference_list in zip(predictions, references):
            # The numbers in the prediction are independent of the
            # reference; extract them once per sample instead of once per
            # reference (the original recomputed this in the inner loop).
            numbers = re.findall(r'\d+', prediction)
            for reference in reference_list:
                if not numbers:
                    continue  # no number predicted -> contributes 0.0
                right_num = sum(1 for number in numbers
                                if str(number) == str(reference))
                score += float(right_num / len(numbers))
        score = score / len(predictions) * 100
        return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchRetrievalEvaluator(BaseEvaluator):
    """Scorer for the LongBench passage-retrieval tasks.

    The gold answer names a paragraph ("Paragraph N" / "段落N"); the
    per-sample score is the fraction of integers mentioned in the
    prediction that equal the gold paragraph id. The mean over samples is
    returned on a 0-100 scale.
    """

    def __init__(self, language: str = 'en') -> None:
        super().__init__()
        assert language in ['en', 'zh']
        self.language = language

    def score(self, predictions: List, references: List) -> dict:
        """Compute the mean per-sample score (see class docstring).

        Returns:
            dict with a single 'score' entry in [0, 100].
        """
        score = 0.
        # The extraction pattern depends only on the language; pick it once
        # instead of re-deciding inside the per-reference loop.
        if self.language == 'en':
            pattern = r'Paragraph (\d+)'
        else:
            pattern = r'段落(\d+)'
        for prediction, reference_list in zip(predictions, references):
            # Numbers in the prediction are independent of the reference;
            # extract them once per sample (the original recomputed this
            # for every reference).
            numbers = re.findall(r'\d+', prediction)
            for reference in reference_list:
                matches = re.findall(pattern, reference)
                # Assumes every gold answer contains exactly one paragraph
                # id -- TODO confirm against the dataset format.
                reference_id = matches[0]
                if not numbers:
                    continue  # no number predicted -> contributes 0.0
                right_num = sum(1 for number in numbers
                                if str(number) == str(reference_id))
                score += float(right_num / len(numbers))
        score = score / len(predictions) * 100
        return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchRougeEvaluator(BaseEvaluator):
    """ROUGE-L (F-measure) scorer for LongBench summarization-style tasks.

    For each prediction the best ROUGE-L F score over all gold references
    is kept; the mean over samples is returned on a 0-100 scale. Chinese
    text is word-segmented with jieba before scoring, since ROUGE operates
    on space-separated tokens.
    """

    def __init__(self, language: str = 'en') -> None:
        super().__init__()
        assert language in ['en', 'zh']
        self.language = language

    def score(self, predictions: List, references: List) -> dict:
        """Compute the mean best-reference ROUGE-L score.

        Returns:
            dict with a single 'score' entry in [0, 100].
        """
        # One scorer instance is enough; the original constructed a new
        # Rouge() for every (prediction, reference) pair.
        rouge = Rouge()
        score = 0.
        for prediction, reference_list in zip(predictions, references):
            # Segment the prediction ONCE per sample into a local variable.
            # The original reassigned `prediction` inside the reference
            # loop, so with several gold references the already
            # space-joined text was fed back through jieba, corrupting it.
            if self.language == 'zh':
                pred_text = ' '.join(jieba.cut(prediction, cut_all=False))
            else:
                pred_text = prediction
            task_score = 0.
            for reference in reference_list:
                if self.language == 'zh':
                    ref_text = ' '.join(jieba.cut(reference, cut_all=False))
                else:
                    ref_text = reference
                if pred_text != '':
                    cur_score = rouge.get_scores([pred_text], [ref_text],
                                                 avg=True)['rouge-l']['f']
                else:
                    # Rouge raises on empty hypotheses; score them 0.
                    cur_score = 0.
                task_score = max(task_score, cur_score)
            score += task_score
        score = score / len(predictions) * 100
        return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchCodeSimEvaluator(BaseEvaluator):
    """Fuzzy-similarity scorer for LongBench code-completion tasks.

    From each prediction the first line that looks like actual code (no
    backticks, '#' or '//' markers) is extracted and compared to the gold
    completion with fuzzywuzzy's ratio. The best match over all references
    is kept per sample; the mean is returned on a 0-100 scale.
    """

    def score(self, predictions: List, references: List) -> dict:
        """Compute the mean best-reference similarity.

        Returns:
            dict with a single 'score' entry in [0, 100].
        """
        score = 0.
        for prediction, reference_list in zip(predictions, references):
            # Extract the candidate code line ONCE per sample. The original
            # reassigned `prediction` inside the reference loop, so with
            # several references the second pass re-processed the already
            # reduced single line instead of the model output.
            code_line = ''
            for line in prediction.lstrip('\n').split('\n'):
                # Skip markdown fences and comment-looking lines.
                if ('`' not in line) and ('#' not in line) and ('//'
                                                                not in line):
                    code_line = line
                    break
            task_score = 0.
            for reference in reference_list:
                task_score = max(task_score,
                                 (fuzz.ratio(code_line, reference) / 100))
            score += task_score
        score = score / len(predictions) * 100
        return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchClassificationEvaluator(BaseEvaluator):
    """Scorer for LongBench classification tasks (TREC / LSHT).

    Each reference entry is a dict with 'answers' (gold labels) and
    'all_classes' (the candidate label set). A prediction scores by exact
    label mentions when any class name appears in it, otherwise by the
    closest class under difflib similarity. Mean over samples, 0-100.
    """

    def score(self, predictions: List, references: List) -> dict:
        """Compute the mean per-sample classification score.

        Returns:
            dict with a single 'score' entry in [0, 100].
        """
        score = 0.
        for prediction, reference_dict in zip(predictions, references):
            reference_list = reference_dict['answers']
            all_classes = reference_dict['all_classes']
            for reference in reference_list:
                # Exact-match candidates: every class name mentioned
                # verbatim in the prediction.
                em_match_list = [
                    class_name for class_name in all_classes
                    if class_name in prediction
                ]
                # Drop candidates that are proper substrings of the gold
                # label. Built as a new list: the original called
                # list.remove() while iterating the same list, which skips
                # the element after each removal.
                em_match_list = [
                    match_term for match_term in em_match_list
                    if not (match_term in reference and match_term != reference)
                ]
                if len(em_match_list) != 0:
                    # BUGFIX: the original tested `em_match_list != 0`,
                    # which is always True for a list, so the fuzzy
                    # fallback below was unreachable.
                    if reference in em_match_list:
                        score += (1.0 / len(em_match_list))
                else:
                    # Fuzzy fallback: pick the class most similar to the
                    # whole prediction and score 1 if it is the gold label.
                    best_match = None
                    highest_similarity = 0
                    for names in all_classes:
                        similarity = difflib.SequenceMatcher(
                            None, names, prediction).ratio()
                        if similarity > highest_similarity:
                            highest_similarity = similarity
                            best_match = names
                    score += float(best_match == reference)
        score = score / len(predictions) * 100
        return {'score': score}
opencompass/datasets/longbench/longbench_2wikim_qa.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBench2wikimqaDataset(BaseDataset):
    """Loader for the LongBench 2wikimqa subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_dureader.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchdureaderDataset(BaseDataset):
    """Loader for the LongBench DuReader subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_gov_report.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchgov_reportDataset(BaseDataset):
    """Loader for the LongBench GovReport subset.

    A summarization task: only 'context' and 'answers' are kept (there is
    no per-sample question).
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'context': context,
            'answers': answers,
        } for context, answers in zip(columns['context'],
                                      columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_hotpot_qa.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchhotpotqaDataset(BaseDataset):
    """Loader for the LongBench HotpotQA subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_lcc.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchlccDataset(BaseDataset):
    """Loader for the LongBench LCC code-completion subset.

    Only 'context' (the code prefix) and 'answers' are kept; the task has
    no separate question field.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'context': context,
            'answers': answers,
        } for context, answers in zip(columns['context'],
                                      columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_lsht.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchlshtDataset(BaseDataset):
    """Loader for the LongBench LSHT classification subset.

    Packs the gold answers together with the full candidate label set
    under 'all_labels', so the classification evaluator can see both.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'all_labels': {
                'answers': answers,
                'all_classes': classes,
            },
        } for question, context, answers, classes in zip(
            columns['input'], columns['context'], columns['answers'],
            columns['all_classes'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_multifieldqa_en.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmultifieldqa_enDataset(BaseDataset):
    """Loader for the LongBench MultiFieldQA (English) subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_multifieldqa_zh.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmultifieldqa_zhDataset(BaseDataset):
    """Loader for the LongBench MultiFieldQA (Chinese) subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_musique.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmusiqueDataset(BaseDataset):
    """Loader for the LongBench MuSiQue subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_narrative_qa.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchnarrativeqaDataset(BaseDataset):
    """Loader for the LongBench NarrativeQA subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_nq.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchnqDataset(BaseDataset):
    """Loader for the LongBench Natural Questions subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_passage_count.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_countDataset(BaseDataset):
    """Loader for the LongBench passage-count subset.

    Only 'context' and 'answers' are kept; the task has no separate
    question field.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'context': context,
            'answers': answers,
        } for context, answers in zip(columns['context'],
                                      columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_passage_retrieval_en.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_retrieval_enDataset(BaseDataset):
    """Loader for the LongBench passage-retrieval (English) subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_passage_retrieval_zh.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_retrieval_zhDataset(BaseDataset):
    """Loader for the LongBench passage-retrieval (Chinese) subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_qasper.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchqasperDataset(BaseDataset):
    """Loader for the LongBench Qasper subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
opencompass/datasets/longbench/longbench_qmsum.py
0 → 100644
View file @
655a807f
from
datasets
import
Dataset
,
load_dataset
from
opencompass.registry
import
LOAD_DATASET
from
..base
import
BaseDataset
@LOAD_DATASET.register_module()
class LongBenchqmsumDataset(BaseDataset):
    """Loader for the LongBench QMSum subset.

    Rebuilds the 'test' split keeping only the fields consumed downstream:
    'input', 'context' and 'answers'.
    """

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        columns = dataset[split]
        rows = [{
            'input': question,
            'context': context,
            'answers': answers,
        } for question, context, answers in zip(
            columns['input'], columns['context'], columns['answers'])]
        dataset[split] = Dataset.from_list(rows)
        return dataset
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment