Unverified commit 3bb3d330, authored by philipwangOvO and committed by GitHub

[Sync] Update LongEval (#443)

parent 2bb7beec
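In summary, the sync makes three coordinated changes: the LEval generation subsets switch from the GPT-judged LEvalGPTEvaluator to the reference-based RougeEvaluator, and the LEval topic-retrieval subset switches from the generic EMEvaluator to a new LEvalEMEvaluator; LongBench removes the nq subset and adds multi_news and samsum configs; and the LongBench summarizer listing is reordered to match.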
@@ -27,7 +27,7 @@ LEval_financialqa_infer_cfg = dict(
 )
 LEval_financialqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_govreport_summ_infer_cfg = dict(
 )
 LEval_govreport_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_legalqa_infer_cfg = dict(
 )
 LEval_legalqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_meetingsumm_infer_cfg = dict(
 )
 LEval_meetingsumm_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_narrativeqa_infer_cfg = dict(
 )
 LEval_narrativeqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator,),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_nq_infer_cfg = dict(
 )
 LEval_nq_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_newssumm_infer_cfg = dict(
 )
 LEval_newssumm_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_ps_summ_infer_cfg = dict(
 )
 LEval_ps_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_patent_summ_infer_cfg = dict(
 )
 LEval_patent_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_review_summ_infer_cfg = dict(
 )
 LEval_review_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )

@@ -27,7 +27,7 @@ LEval_scientificqa_infer_cfg = dict(
 )
 LEval_scientificqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
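All eleven LEval generation subsets above replace the GPT-judged LEvalGPTEvaluator with the reference-based RougeEvaluator, which removes the dependency on a judge model at the cost of rewarding lexical overlap rather than judged quality. As a rough illustration of what ROUGE scoring involves, here is a minimal sketch built on the third-party `rouge` package; the actual RougeEvaluator in opencompass may differ in metric variants and post-processing:

```python
# A minimal sketch of ROUGE-L scoring using the third-party `rouge`
# package; opencompass's RougeEvaluator may differ in detail.
from rouge import Rouge

def rouge_l_score(predictions, references):
    scorer = Rouge()
    scores = []
    for pred, ref in zip(predictions, references):
        # The `rouge` package raises on empty hypotheses, so substitute
        # a harmless placeholder for empty generations.
        result = scorer.get_scores(pred or 'none', ref)[0]
        scores.append(result['rouge-l']['f'])
    return {'rouge-l': 100 * sum(scores) / max(len(scores), 1)}

print(rouge_l_score(['the cat sat on the mat'], ['a cat sat on a mat']))
```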
@@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
-from opencompass.datasets.leval import LEvalTopicRetrievalDataset
+from opencompass.datasets.leval import LEvalTopicRetrievalDataset, LEvalEMEvaluator
 from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi, general_postprocess

 LEval_tr_reader_cfg = dict(
@@ -28,7 +28,7 @@ LEval_tr_infer_cfg = dict(
 )
 LEval_tr_eval_cfg = dict(
-    evaluator=dict(type=EMEvaluator),
+    evaluator=dict(type=LEvalEMEvaluator),
     pred_postprocessor=dict(type=general_postprocess),
     pred_role='BOT'
 )
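Topic retrieval is the one LEval subset that keeps exact-match scoring, but it moves from the generic EMEvaluator to the LEval-specific LEvalEMEvaluator exported from opencompass.datasets.leval. A hypothetical sketch of the normalise-then-match scoring such an evaluator performs; the normalisation rules here are assumptions, not taken from the diff:

```python
# Hypothetical sketch of an exact-match evaluator; the real
# LEvalEMEvaluator in opencompass.datasets.leval may normalise
# predictions differently.
def exact_match_score(predictions, references):
    def normalise(text):
        # Assumed normalisation: trim, lowercase, drop a trailing
        # period, and collapse internal whitespace.
        return ' '.join(text.strip().lower().rstrip('.').split())

    correct = sum(normalise(p) == normalise(r)
                  for p, r in zip(predictions, references))
    return {'score': 100 * correct / max(len(references), 1)}

print(exact_match_score(['Paris.'], ['paris']))  # {'score': 100.0}
```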
@@ -27,7 +27,7 @@ LEval_tvshow_summ_infer_cfg = dict(
 )
 LEval_tvshow_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
@@ -7,7 +7,6 @@ with read_base():
     from .longbenchmultifieldqa_en.longbench_multifieldqa_en_gen import LongBench_multifieldqa_en_datasets
     from .longbenchmultifieldqa_zh.longbench_multifieldqa_zh_gen import LongBench_multifieldqa_zh_datasets
     from .longbenchnarrativeqa.longbench_narrativeqa_gen import LongBench_narrativeqa_datasets
-    from .longbenchnq.longbench_nq_gen import LongBench_nq_datasets
     from .longbenchqasper.longbench_qasper_gen import LongBench_qasper_datasets
     from .longbenchtriviaqa.longbench_triviaqa_gen import LongBench_triviaqa_datasets
     from .longbenchgov_report.longbench_gov_report_gen import LongBench_gov_report_datasets
@@ -21,5 +20,7 @@ with read_base():
     from .longbenchpassage_count.longbench_passage_count_gen import LongBench_passage_count_datasets
     from .longbenchtrec.longbench_trec_gen import LongBench_trec_datasets
     from .longbenchlsht.longbench_lsht_gen import LongBench_lsht_datasets
+    from .longbenchmulti_news.longbench_multi_news_gen import LongBench_multi_news_datasets
+    from .longbenchsamsum.longbench_samsum_gen import LongBench_samsum_datasets

 longbench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
\ No newline at end of file
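No per-dataset bookkeeping is needed when subsets come and go, because the last line gathers every imported `*_datasets` list automatically. A standalone illustration of that idiom:

```python
# Standalone illustration of the aggregation idiom: every module-level
# name ending in '_datasets' is a list of dataset configs, and
# sum(..., []) concatenates them into one flat list.
a_datasets = [dict(abbr='task_a')]
b_datasets = [dict(abbr='task_b')]

combined = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in combined])  # ['task_a', 'task_b']
```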
@@ -0,0 +1,3 @@
+from mmengine.config import read_base
+with read_base():
+    from .longbench_multi_news_gen_f6e3fb import LongBench_multi_news_datasets  # noqa: F401, F403
@@ -0,0 +1,38 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import LongBenchRougeEvaluator, LongBenchmulti_newsDataset
+
+LongBench_multi_news_reader_cfg = dict(
+    input_columns=['context'],
+    output_column='answers',
+    train_split='test',
+    test_split='test'
+)
+
+LongBench_multi_news_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:'),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512)
+)
+
+LongBench_multi_news_eval_cfg = dict(
+    evaluator=dict(type=LongBenchRougeEvaluator),
+    pred_role='BOT'
+)
+
+LongBench_multi_news_datasets = [
+    dict(
+        type=LongBenchmulti_newsDataset,
+        abbr='LongBench_multi_news',
+        path='THUDM/LongBench',
+        name='multi_news',
+        reader_cfg=LongBench_multi_news_reader_cfg,
+        infer_cfg=LongBench_multi_news_infer_cfg,
+        eval_cfg=LongBench_multi_news_eval_cfg)
+]
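To smoke-test only the new subset, a minimal top-level config along the following lines should work; the import paths below are assumptions based on the directory layout in this diff, and the model config is a placeholder for any config under configs/models:

```python
# eval_longbench_multi_news.py -- hypothetical minimal entry config for
# smoke-testing the new subset. The model import is an assumption;
# substitute any model config that exists under configs/models.
from mmengine.config import read_base

with read_base():
    from .datasets.longbench.longbenchmulti_news.longbench_multi_news_gen import \
        LongBench_multi_news_datasets
    from .models.hf_internlm_7b import models  # assumed model config

datasets = LongBench_multi_news_datasets
```

Assuming the standard opencompass entry point, `python run.py configs/eval_longbench_multi_news.py` would then evaluate just this dataset.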
@@ -1,3 +1,3 @@
 from mmengine.config import read_base
 with read_base():
-    from .longbench_nq_gen_d30cb9 import LongBench_nq_datasets  # noqa: F401, F403
+    from .longbench_samsum_gen_f4416d import LongBench_samsum_datasets  # noqa: F401, F403
@@ -1,38 +1,38 @@
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.datasets import LongBenchF1Evaluator, LongBenchnqDataset
+from opencompass.datasets import LongBenchRougeEvaluator, LongBenchsamsumDataset

-LongBench_nq_reader_cfg = dict(
+LongBench_samsum_reader_cfg = dict(
     input_columns=['context', 'input'],
     output_column='answers',
     train_split='test',
     test_split='test'
 )

-LongBench_nq_infer_cfg = dict(
+LongBench_samsum_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
         template=dict(
             round=[
-                dict(role='HUMAN', prompt='Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}'),
+                dict(role='HUMAN', prompt='Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}'),
             ], )),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer, max_out_len=32)
+    inferencer=dict(type=GenInferencer, max_out_len=128)
 )

-LongBench_nq_eval_cfg = dict(
-    evaluator=dict(type=LongBenchF1Evaluator),
+LongBench_samsum_eval_cfg = dict(
+    evaluator=dict(type=LongBenchRougeEvaluator),
     pred_role='BOT'
 )

-LongBench_nq_datasets = [
+LongBench_samsum_datasets = [
     dict(
-        type=LongBenchnqDataset,
-        abbr='LongBench_nq',
+        type=LongBenchsamsumDataset,
+        abbr='LongBench_samsum',
         path='THUDM/LongBench',
-        name='nq',
-        reader_cfg=LongBench_nq_reader_cfg,
-        infer_cfg=LongBench_nq_infer_cfg,
-        eval_cfg=LongBench_nq_eval_cfg)
+        name='samsum',
+        reader_cfg=LongBench_samsum_reader_cfg,
+        infer_cfg=LongBench_samsum_infer_cfg,
+        eval_cfg=LongBench_samsum_eval_cfg)
 ]
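The fields consumed above ('context', 'input', 'answers') follow LongBench's published schema on the HuggingFace hub. A hypothetical inspection snippet, assuming the HuggingFace `datasets` package and the THUDM/LongBench layout referenced by the config:

```python
# Hypothetical inspection snippet for the samsum subset used above.
from datasets import load_dataset

ds = load_dataset('THUDM/LongBench', 'samsum', split='test')
sample = ds[0]
print(sample['context'][:200])  # few-shot example dialogues, per the prompt
print(sample['input'][:200])    # the dialogue to summarise
print(sample['answers'])        # reference summaries for ROUGE scoring
```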
@@ -13,19 +13,20 @@ summarizer = dict(
         '--------- LongBench Summarization ---------', # category
         'LongBench_gov_report',
         'LongBench_qmsum',
+        'LongBench_multi_news',
         'LongBench_vcsum',
         '--------- LongBench Few-shot Learning ---------', # category
         'LongBench_trec',
-        'LongBench_nq',
         'LongBench_triviaqa',
+        'LongBench_samsum',
         'LongBench_lsht',
-        '--------- LongBench Code Completion ---------', # category
-        'LongBench_lcc',
-        'LongBench_repobench-p',
         '--------- LongBench Synthetic Tasks ---------', # category
-        'LongBench_passage_retrieval_en',
         'LongBench_passage_count',
+        'LongBench_passage_retrieval_en',
         'LongBench_passage_retrieval_zh',
+        '--------- LongBench Code Completion ---------', # category
+        'LongBench_lcc',
+        'LongBench_repobench-p',
     ],
     summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
     prompt_db=dict(
@@ -1,3 +1,4 @@
+from .evaluators import LEvalEMEvaluator # noqa: F401, F403
 from .evaluators import LEvalGPTEvaluator # noqa: F401, F403
 from .leval_coursera import * # noqa: F401, F403
 from .leval_financial_qa import * # noqa: F401, F403