Unverified commit 86d5ec3d authored by Leymore, committed by GitHub

Update configs (#9)

* Update implements

* Update
parent 2d0b184b
from mmengine.config import read_base

with read_base():
-    from .summedits_gen_4fb38b import summedits_datasets # noqa: F401, F403
+    from .summedits_gen_315438 import summedits_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import SummeditsDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

summedits_reader_cfg = dict(
    input_columns=['doc', 'summary'], output_column='label')

summedits_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt=
                """Given the document below, you have to determine if "Yes" or "No", the summary is factually consistent with the document.
Document:
{doc}
Summary:
{summary}
Question:
Is the summary factually consistent with the document?
A. Yes
B. No
Answer:"""
            ),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

summedits_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

summedits_datasets = [
    dict(
        abbr='summedits',
        type=SummeditsDataset_V2,
        path='./data/summedits/summedits.jsonl',
        reader_cfg=summedits_reader_cfg,
        infer_cfg=summedits_infer_cfg,
        eval_cfg=summedits_eval_cfg)
]
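For context on the `first_capital_postprocess` used above: the prompt asks for an "A"/"B" style answer, and a first-capital postprocessor is assumed to reduce the generated text to its first capital letter before accuracy scoring. A minimal illustrative sketch (not the OpenCompass source):

```
import re

def first_capital(text: str) -> str:
    """Stand-in for a first-capital postprocessor: keep only the first
    capital letter of the generation, or '' if there is none."""
    match = re.search(r'[A-Z]', text)
    return match.group(0) if match else ''

print(first_capital('B. No'))                               # -> 'B'
print(first_capital('A. Yes, the summary is consistent.'))  # -> 'A'
```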
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import SummeditsDataset_V2
+from opencompass.utils.text_postprocessors import first_capital_postprocess

summedits_reader_cfg = dict(
    input_columns=['doc', 'summary'], output_column='label')
@@ -23,7 +24,7 @@ summedits_infer_cfg = dict(
summedits_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
-    pred_postprocessor=dict(type="first-capital"),
+    pred_postprocessor=dict(type=first_capital_postprocess),
)

summedits_datasets = [
...
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import SummScreenDataset
+from opencompass.utils.text_postprocessors import general_cn_postprocess

summscreen_reader_cfg = dict(
    input_columns='content',
@@ -33,8 +34,8 @@ summscreen_infer_cfg = dict(
summscreen_eval_cfg = dict(
    evaluator=dict(type=BleuEvaluator),
    pred_role='BOT',
-    pred_postprocessor=dict(type='general_cn'),
-    dataset_postprocessor=dict(type='general_cn'))
+    pred_postprocessor=dict(type=general_cn_postprocess),
+    dataset_postprocessor=dict(type=general_cn_postprocess))

summscreen_datasets = [
    dict(
...
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import SummScreenDataset
+from opencompass.utils.text_postprocessors import general_cn_postprocess

summscreen_reader_cfg = dict(
    input_columns='content',
@@ -21,8 +22,8 @@ summscreen_infer_cfg = dict(
summscreen_eval_cfg = dict(
    evaluator=dict(type=BleuEvaluator),
-    pred_postprocessor=dict(type='general_cn'),
-    dataset_postprocessor=dict(type='general_cn'))
+    pred_postprocessor=dict(type=general_cn_postprocess),
+    dataset_postprocessor=dict(type=general_cn_postprocess))

summscreen_datasets = [
    dict(
...
from mmengine.config import read_base

with read_base():
-    from .triviaqa_gen_3e39a5 import triviaqa_datasets # noqa: F401, F403
+    from .triviaqa_gen_2121ce import triviaqa_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset, TriviaQAEvaluator

triviaqa_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer',
    train_split='dev',
    test_split='dev')

triviaqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Answer these questions, your answer should be as simple as possible, start your answer with the prompt \'The answer is \'.\nQ: {question}?'),
                dict(role='BOT', prompt='A:'),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=50))

triviaqa_eval_cfg = dict(
    evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT')

triviaqa_datasets = [
    dict(
        type=TriviaQADataset,
        abbr='triviaqa',
        path='./data/triviaqa/',
        reader_cfg=triviaqa_reader_cfg,
        infer_cfg=triviaqa_infer_cfg,
        eval_cfg=triviaqa_eval_cfg)
]
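The prompt above asks the model to lead with "The answer is ...", and `TriviaQAEvaluator` then scores the generation against the gold answers. A hedged sketch of the kind of normalize-and-match check such an evaluator typically applies (illustrative only, not the OpenCompass implementation):

```
import re
import string

def normalize(text: str) -> str:
    """Lowercase, drop articles and punctuation, collapse whitespace."""
    text = re.sub(r'\b(a|an|the)\b', ' ', text.lower())
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    return ' '.join(text.split())

def is_correct(prediction: str, gold_aliases: list) -> bool:
    """True if any normalized gold alias appears in the normalized prediction."""
    pred = normalize(prediction)
    return any(normalize(alias) in pred for alias in gold_aliases)

print(is_correct('The answer is Paris.', ['Paris', 'City of Paris']))  # True
```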
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V2
+from opencompass.utils.text_postprocessors import first_capital_postprocess

winogrande_reader_cfg = dict(
    input_columns=["opt1", "opt2"],
@@ -27,7 +28,7 @@ winogrande_infer_cfg = dict(
winogrande_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
-    pred_postprocessor=dict(type="first-capital"),
+    pred_postprocessor=dict(type=first_capital_postprocess),
)

winogrande_datasets = [
...
from mmengine.config import read_base

with read_base():
-    from .winogrande_ppl_18e5de import winogrande_datasets # noqa: F401, F403
+    from .winogrande_ppl_55a66e import winogrande_datasets # noqa: F401, F403
@@ -15,9 +15,9 @@ winogrande_infer_cfg = dict(
        type=PromptTemplate,
        template={
            i: dict(round=[
-                dict(role="HUMAN", prompt=f"Good sentence: {{opt{i+1}}}"),
+                dict(role="HUMAN", prompt=f"Good sentence: {{opt{i}}}"),
            ])
-            for i in range(2)
+            for i in range(1, 3)
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))
...
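The prompts themselves are unchanged by this edit; only the template keys move from 0/1 to 1/2, presumably to line up with the dataset's answer labels. A quick sketch of what the dict comprehension produces before and after (plain strings; `PromptTemplate` wrapping omitted):

```
before = {i: f"Good sentence: {{opt{i + 1}}}" for i in range(2)}
after = {i: f"Good sentence: {{opt{i}}}" for i in range(1, 3)}
print(before)  # {0: 'Good sentence: {opt1}', 1: 'Good sentence: {opt2}'}
print(after)   # {1: 'Good sentence: {opt1}', 2: 'Good sentence: {opt2}'}
```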
from opencompass.models import HuggingFaceCausalLM
+from mmengine.config import read_base

with read_base():
    from .datasets.piqa.piqa_ppl import piqa_datasets

datasets = piqa_datasets
@@ -24,5 +26,3 @@ models = [
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
@@ -2,4 +2,5 @@ GaokaoBench_summary_groups = []

# gaokao-bench
_GaokaoBench_weights = {'2010-2022_Math_II_MCQs': 1090, '2010-2022_Math_I_MCQs': 1070, '2010-2022_History_MCQs': 1148, '2010-2022_Biology_MCQs': 900, '2010-2022_Political_Science_MCQs': 1280, '2010-2022_Physics_MCQs': 384, '2010-2022_Chemistry_MCQs': 744, '2010-2013_English_MCQs': 105, '2010-2022_Chinese_Modern_Lit': 261, '2010-2022_English_Fill_in_Blanks': 900.0, '2012-2022_English_Cloze_Test': 260, '2010-2022_Geography_MCQs': 380, '2010-2022_English_Reading_Comp': 940, '2010-2022_Chinese_Lang_and_Usage_MCQs': 240}
+_GaokaoBench_weights = {'GaokaoBench_' + k: v for k, v in _GaokaoBench_weights.items()}
GaokaoBench_summary_groups.append({'name': 'GaokaoBench', 'subsets': list(_GaokaoBench_weights.keys()), 'weights': _GaokaoBench_weights})
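The added line prefixes every weight key with 'GaokaoBench_' so the keys match the per-subset dataset abbreviations. The weights are presumably consumed as a weighted mean over subset scores; an illustrative sketch with made-up scores:

```
# Illustrative only: two of the prefixed weights from the dict above, paired
# with made-up per-subset scores.
weights = {'GaokaoBench_2010-2022_Math_II_MCQs': 1090,
           'GaokaoBench_2010-2013_English_MCQs': 105}
scores = {'GaokaoBench_2010-2022_Math_II_MCQs': 42.0,
          'GaokaoBench_2010-2013_English_MCQs': 60.0}

# Assumed aggregation: weighted mean of the subset scores.
weighted = sum(scores[k] * w for k, w in weights.items()) / sum(weights.values())
print(round(weighted, 2))  # 43.58
```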
@@ -23,3 +23,9 @@ for _lang_serie in _flores_lang_map:
        'name': f'flores_100_English_{_lang_serie}',
        'subsets': [f'flores_100_eng-{lang_name}' for lang_name in _flores_lang_map[_lang_serie]]
    })

+flores_summary_groups.append({
+    'name': 'flores_100',
+    'subsets': [f'flores_100_{lang_name}-eng' for lang_name in sum(_flores_lang_map.values(), [])] + \
+        [f'flores_100_eng-{lang_name}' for lang_name in sum(_flores_lang_map.values(), [])]
+})
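The new 'flores_100' group relies on `sum(_flores_lang_map.values(), [])` to flatten the per-series language lists into one list, then pairs every language with English in both directions. A small sketch with a stand-in language map (the real `_flores_lang_map` is not shown in this hunk):

```
# Stand-in for the real _flores_lang_map, which is defined earlier in the file.
_assumed_lang_map = {
    'Indo-European-Germanic': ['deu', 'nld'],
    'Indo-European-Romance': ['fra', 'spa'],
}

all_langs = sum(_assumed_lang_map.values(), [])  # flatten one level
print(all_langs)  # ['deu', 'nld', 'fra', 'spa']

subsets = ([f'flores_100_{lang}-eng' for lang in all_langs] +
           [f'flores_100_eng-{lang}' for lang in all_langs])
print(len(subsets))  # 8
```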
jigsaw_multilingual_summary_groups = []

# jigsaw multilingual
_jigsaw_multilingual = ['es', 'fr', 'it', 'pt', 'ru', 'tr']
_jigsaw_multilingual = ['jigsaw_multilingual_' + s for s in _jigsaw_multilingual]
jigsaw_multilingual_summary_groups.append({'name': 'jigsaw_multilingual', 'subsets': _jigsaw_multilingual})
@@ -7,77 +7,85 @@ with read_base():
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.flores import flores_summary_groups
+    from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups

summarizer = dict(
    dataset_abbrs = [
-        '--- Exam ---',
-        'agieval',
-        'mmlu-all-set',
+        '--------- 考试 Exam ---------', # category
+        # 'Mixed', # subcategory
        "ceval",
+        'agieval',
+        'mmlu',
        "GaokaoBench",
-        "bbh",
-        '--- Coding ---',
-        'openai_humaneval',
-        'mbpp',
-        '--- ChineseUniversal ---',
-        'C3',
-        'CMRC_dev',
-        'DRCD_dev',
+        'ARC-c',
+        '--------- 语言 Language ---------', # category
+        # '字词释义', # subcategory
+        'WiC',
+        'summedits',
+        # '成语习语', # subcategory
+        'chid-dev',
+        # '语义相似度', # subcategory
        'afqmc-dev',
-        'cmnli',
-        'ocnli',
        'bustm-dev',
-        'chid-dev',
+        # '指代消解', # subcategory
        'cluewsc-dev',
-        'csl_dev',
-        'eprstmt-dev',
+        'WSC',
+        'winogrande',
+        # '翻译', # subcategory
+        'flores_100',
+        '--------- 知识 Knowledge ---------', # category
+        # '知识问答', # subcategory
+        'BoolQ',
+        'commonsense_qa',
+        'nq',
+        'triviaqa',
+        # '多语种问答', # subcategory
+        '--------- 推理 Reasoning ---------', # category
+        # '文本蕴含', # subcategory
+        'cmnli',
+        'ocnli',
        'ocnli_fc-dev',
-        'tnews-dev',
-        'lcsts',
-        '--- Completion ---',
-        'lambada',
-        'story_cloze',
-        '--- EnglishUniversal ---',
        'AX_b',
        'AX_g',
-        'BoolQ',
        'CB',
-        'COPA',
-        'MultiRC',
        'RTE',
+        # '常识推理', # subcategory
+        'story_cloze',
+        'COPA',
        'ReCoRD',
-        'WiC',
-        'WSC',
-        'race-high',
-        'race-middle',
-        '--- NLG ---',
-        'Xsum',
-        '--- Reasoning ---',
-        'gsm8k',
-        'summedits',
-        'math',
-        'TheoremQA',
-        '--- QA ---',
        'hellaswag',
-        'ARC-e',
-        'ARC-c',
-        'commonsense_qa',
        'piqa',
        'siqa',
        'strategyqa',
-        'winogrande',
-        'openbookqa',
+        # '数学推理', # subcategory
+        'math',
+        'gsm8k',
+        # '定理应用', # subcategory
+        'TheoremQA',
+        # '代码', # subcategory
+        'openai_humaneval',
+        'mbpp',
+        # '综合推理', # subcategory
+        "bbh",
+        '--------- 理解 Understanding ---------', # category
+        # '阅读理解', # subcategory
+        'C3',
+        'CMRC_dev',
+        'DRCD_dev',
+        'MultiRC',
+        'race-middle',
+        'race-high',
        'openbookqa_fact',
-        'nq',
-        'triviaqa',
-        '--- Translation ---',
-        'flores_100_Indo-European-Germanic_English',
-        'flores_100_English_Indo-European-Germanic',
-        'flores_100_Indo-European-Romance_English',
-        'flores_100_English_Indo-European-Romance',
-        'flores_100_zho_simpl-eng',
-        'flores_100_eng-zho_simpl',
-        '--- Security ---',
+        # '内容总结', # subcategory
+        'csl_dev',
+        'lcsts',
+        'Xsum',
+        # '内容分析', # subcategory
+        'eprstmt-dev',
+        'lambada',
+        'tnews-dev',
+        '--------- 安全 Safety ---------', # category
+        # '偏见', # subcategory
        'crows_pairs',
    ],
    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
...
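The `summary_groups` line gathers every `*_summary_groups` list pulled in through `read_base()` into a single list via `locals()`. A self-contained sketch of that expression with dummy groups:

```
# Dummy groups standing in for the lists imported via read_base(); the
# subset names are hypothetical.
bbh_summary_groups = [{'name': 'bbh', 'subsets': ['bbh_navigate', 'bbh_snarks']}]
flores_summary_groups = [{'name': 'flores_100', 'subsets': ['flores_100_deu-eng']}]

# Concatenate every *_summary_groups list visible in the current namespace.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in summary_groups])  # ['bbh', 'flores_100']
```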
@@ -11,7 +11,7 @@ with read_base():
summarizer = dict(
    dataset_abbrs = [
        '--- Exam ---',
-        'mmlu-all-set',
+        'mmlu',
        "ceval",
        "bbh",
        '--- ChineseUniversal ---',
...
@@ -24,7 +24,7 @@ If you want to perform evaluations on the humaneval dataset, follow these steps.
```
git clone https://github.com/openai/human-eval.git
cd human-eval
-pip install -r requirments.txt
+pip install -r requirements.txt
pip install -e .
cd ..
```
...
@@ -45,11 +45,10 @@ We use the following tools for linting and formatting:
Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/OpenCompass/blob/main/setup.cfg).

## Pre-commit Hook

We use a [pre-commit hook](https://pre-commit.com/) that checks and formats `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`,
-fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit.
+fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit.
The config for a pre-commit hook is stored in [.pre-commit-config](xxxxxxx).

After you clone the repository, you will need to install and initialize the pre-commit hook.
@@ -66,4 +65,4 @@ pre-commit install

After this, code linters and formatters will be enforced on every commit.

> Before you create a PR, make sure that your code lints and is formatted by yapf.
\ No newline at end of file
@@ -24,7 +24,7 @@ pip install -e .
```
git clone https://github.com/openai/human-eval.git
cd human-eval
-pip install -r requirments.txt
+pip install -r requirements.txt
pip install -e .
cd ..
```
...
@@ -48,7 +48,7 @@ Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/OpenCompass/b

## Pre-commit Hook

We use a [pre-commit hook](https://pre-commit.com/) that automatically checks and formats `flake8`, `yapf`, `isort`, `trailing whitespaces` and `markdown files` on every commit,
-fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma` and `mixed-line-ending`, and sorts `requirments.txt`. The pre-commit hook configuration is stored in [.pre-commit-config]().
+fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma` and `mixed-line-ending`, and sorts `requirements.txt`. The pre-commit hook configuration is stored in [.pre-commit-config](<>).

After you clone the repository, you need to install and initialize the pre-commit hook.
@@ -64,4 +64,4 @@ pre-commit install

Afterwards, code linters and formatters will be enforced on every commit.

> Before you create a PR, make sure your code passes lint checks and is formatted by yapf.
\ No newline at end of file