Unverified Commit bf79ff1c authored by Tong Gao's avatar Tong Gao Committed by GitHub
Browse files

[Feature] Add LEval datasets


Co-authored-by: default avatarkennymckormick <dhd@pku.edu.cn>
parent 8d9cee06
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403 from .bbh_gen_6bd693 import bbh_datasets # noqa: F401, F403
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .cmmlu_gen_ffe7c0 import cmmlu_datasets # noqa: F401, F403 from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .cmmlu_ppl_fd1f2f import cmmlu_datasets # noqa: F401, F403 from .cmmlu_ppl_8b9c76 import cmmlu_datasets # noqa: F401, F403
...@@ -5,7 +5,7 @@ with read_base(): ...@@ -5,7 +5,7 @@ with read_base():
from ..ceval.ceval_ppl_578f8d import ceval_datasets from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..agieval.agieval_mixed_2f14ad import agieval_datasets from ..agieval.agieval_mixed_2f14ad import agieval_datasets
from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets from ..bbh.bbh_gen_6bd693 import bbh_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
......
...@@ -2,7 +2,7 @@ from mmengine.config import read_base ...@@ -2,7 +2,7 @@ from mmengine.config import read_base
with read_base(): with read_base():
from ..ceval.ceval_ppl_578f8d import ceval_datasets from ..ceval.ceval_ppl_578f8d import ceval_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets from ..bbh.bbh_gen_6bd693 import bbh_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
......
...@@ -3,9 +3,9 @@ from mmengine.config import read_base ...@@ -3,9 +3,9 @@ from mmengine.config import read_base
with read_base(): with read_base():
from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets
from ..ceval.ceval_gen_5f30c7 import ceval_datasets from ..ceval.ceval_gen_5f30c7 import ceval_datasets
from ..agieval.agieval_gen_397d81 import agieval_datasets from ..agieval.agieval_gen_64afd3 import agieval_datasets
from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets from ..bbh.bbh_gen_6bd693 import bbh_datasets
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
......
...@@ -3,7 +3,7 @@ from mmengine.config import read_base ...@@ -3,7 +3,7 @@ from mmengine.config import read_base
with read_base(): with read_base():
from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets
from ..ceval.ceval_gen_5f30c7 import ceval_datasets from ..ceval.ceval_gen_5f30c7 import ceval_datasets
from ..bbh.bbh_gen_5b92b0 import bbh_datasets from ..bbh.bbh_gen_6bd693 import bbh_datasets
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
...@@ -35,6 +35,6 @@ with read_base(): ...@@ -35,6 +35,6 @@ with read_base():
from ..obqa.obqa_gen_9069e4 import obqa_datasets from ..obqa.obqa_gen_9069e4 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ..crowspairs.crowspairs_gen_21f7cb import crowspairs_datasets from ..crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .crowspairs_gen_21f7cb import crowspairs_datasets # noqa: F401, F403 from .crowspairs_gen_381af0 import crowspairs_datasets # noqa: F401, F403
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .cvalues_responsibility_gen_4aec9f import cvalues_datasets # noqa: F401, F403 from .cvalues_responsibility_gen_543378 import cvalues_datasets # noqa: F401, F403
from mmengine.config import read_base  # NOTE(review): read_base is used below but was missing from the imports
from opencompass.models import HuggingFaceCausalLM
import torch

# Entry config for the long-context LEval benchmark: pulls in every LEval
# sub-dataset config, an example model, and the LEval summarizer.
with read_base():
    # Each import exposes a `*_datasets` list of dataset configs.
    from .datasets.LEvalNaturalQuestion.LEval_naturalquestion_gen import LEval_nq_datasets
    from .datasets.LEvalNarrativeQA.LEval_narrativeqa_gen import LEval_narrativeqa_datasets
    from .datasets.LEvalMultidocQA.LEval_multidocqa_gen import LEval_multidocqa_datasets
    from .datasets.LEvalCoursera.LEval_coursera_gen import LEval_coursera_datasets
    from .datasets.LEvalTPO.LEval_tpo_gen import LEval_tpo_datasets
    from .datasets.LEvalQuality.LEval_quality_gen import LEval_quality_datasets
    from .datasets.LEvalGSM100.LEval_gsm100_gen import LEval_gsm100_datasets
    from .datasets.LEvalTopicRetrieval.LEval_topic_retrieval_gen import LEval_tr_datasets
    from .datasets.LEvalFinancialQA.LEval_financialqa_gen import LEval_financialqa_datasets
    from .datasets.LEvalGovReportSumm.LEval_gov_report_summ_gen import LEval_govreport_summ_datasets
    from .datasets.LEvalLegalContractQA.LEval_legalcontractqa_gen import LEval_legalqa_datasets
    from .datasets.LEvalMeetingSumm.LEval_meetingsumm_gen import LEval_meetingsumm_datasets
    from .datasets.LEvalNewsSumm.LEval_newssumm_gen import LEval_newssumm_datasets
    from .datasets.LEvalPaperAssistant.LEval_paper_assistant_gen import LEval_ps_summ_datasets
    from .datasets.LEvalPatentSumm.LEval_patent_summ_gen import LEval_patent_summ_datasets
    from .datasets.LEvalTVShowSumm.LEval_tvshow_summ_gen import LEval_tvshow_summ_datasets
    from .datasets.LEvalScientificQA.LEval_scientificqa_gen import LEval_scientificqa_datasets
    from .datasets.LEvalReviewSumm.LEval_review_summ_gen import LEval_review_summ_datasets
    # choose a model of interest -- internlm-7b as an example
    from .models.hf_internlm_7b import models
    # and output the results in a chosen format
    from .summarizers.LEval import summarizer

# Flattened list of every LEval dataset config, consumed by the runner.
# Fix: LEval_newssumm_datasets was previously spliced in twice, so the
# news-summarization subset would have been evaluated twice.
datasets = [*LEval_coursera_datasets,
            *LEval_tpo_datasets,
            *LEval_quality_datasets,
            *LEval_gsm100_datasets,
            *LEval_tr_datasets,
            *LEval_financialqa_datasets,
            *LEval_govreport_summ_datasets,
            *LEval_legalqa_datasets,
            *LEval_meetingsumm_datasets,
            *LEval_multidocqa_datasets,
            *LEval_narrativeqa_datasets,
            *LEval_nq_datasets,
            *LEval_newssumm_datasets,
            *LEval_patent_summ_datasets,
            *LEval_tvshow_summ_datasets,
            *LEval_scientificqa_datasets,
            *LEval_review_summ_datasets,
            *LEval_ps_summ_datasets]
# Summarizer config for LEval: fixes the row order of the results table and
# collects any summary groups defined by the imported dataset configs.
summarizer = dict(
    # Order in which dataset scores appear in the report; the dashed entries
    # are visual section headers, not datasets.
    dataset_abbrs = [
        '--------- LEval Exact Match (Acc) ---------', # category
        "LEval_coursera",
        'LEval_gsm100',
        'LEval_quality',
        "LEval_tpo",
        'LEval_topic_retrieval',
        '--------- LEval Gen (ROUGE) ---------', # category
        'LEval_financialqa',
        'LEval_gov_report_summ',
        'LEval_legal_contract_qa',
        'LEval_meeting_summ',
        'LEval_multidocqa',
        'LEval_narrativeqa',
        'LEval_nq',
        'LEval_news_summ',
        'LEval_paper_assistant',
        'LEval_patent_summ',
        'LEval_review_summ',
        'LEval_scientificqa',
        'LEval_tvshow_summ'
    ],
    # Gather every `*_summary_groups` list that the imported configs put into
    # this module's namespace, flattened into a single list.
    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
    # Prompt database bookkeeping for the configs under configs/datasets.
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
        blacklist='.promptignore'),
)
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class LEvalCourseraDataset(BaseDataset):
    """Loader for the LEval Coursera subset.

    Each raw record carries parallel ``instructions``/``outputs`` lists over a
    shared ``input`` context; they are flattened into one
    (question, context, answer) row per instruction.
    """

    @staticmethod
    def load(**kwargs):
        """Load the raw dataset and rebuild its 'test' split as flat QA rows.

        Args:
            **kwargs: Forwarded verbatim to ``datasets.load_dataset``.

        Returns:
            The DatasetDict with the 'test' split replaced by flattened rows.
        """
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        # Iterate rows directly instead of re-indexing each column per index:
        # `dataset[split]['col'][i]` re-materializes the whole column on every
        # access, which is quadratic in the split size.
        for row in dataset[split]:
            context = row['input']
            for question, answer in zip(row['instructions'], row['outputs']):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class LEvalFinancialQADataset(BaseDataset):
    """Loader for the LEval FinancialQA subset.

    Each raw record carries parallel ``instructions``/``outputs`` lists over a
    shared ``input`` context; they are flattened into one
    (question, context, answer) row per instruction.
    """

    @staticmethod
    def load(**kwargs):
        """Load the raw dataset and rebuild its 'test' split as flat QA rows.

        Args:
            **kwargs: Forwarded verbatim to ``datasets.load_dataset``.

        Returns:
            The DatasetDict with the 'test' split replaced by flattened rows.
        """
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        # Iterate rows directly instead of re-indexing each column per index:
        # `dataset[split]['col'][i]` re-materializes the whole column on every
        # access, which is quadratic in the split size.
        for row in dataset[split]:
            context = row['input']
            for question, answer in zip(row['instructions'], row['outputs']):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class LEvalGovReportSummDataset(BaseDataset):
    """Loader for the LEval government-report summarization subset.

    Each raw record carries parallel ``instructions``/``outputs`` lists over a
    shared ``input`` context; they are flattened into one
    (question, context, answer) row per instruction.
    """

    @staticmethod
    def load(**kwargs):
        """Load the raw dataset and rebuild its 'test' split as flat QA rows.

        Args:
            **kwargs: Forwarded verbatim to ``datasets.load_dataset``.

        Returns:
            The DatasetDict with the 'test' split replaced by flattened rows.
        """
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        # Iterate rows directly instead of re-indexing each column per index:
        # `dataset[split]['col'][i]` re-materializes the whole column on every
        # access, which is quadratic in the split size.
        for row in dataset[split]:
            context = row['input']
            for question, answer in zip(row['instructions'], row['outputs']):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment