Unverified Commit b4afe3e7 authored by Fengzhe Zhou, committed by GitHub

[Sync] Add InternLM2 Keyset Evaluation Demo (#807)


Co-authored-by: zhangyifan1 <zhangyifan1@pjlab.org.cn>
parent acae5609
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='Question:\n', end='\n'),
        dict(role="BOT", begin="Answer:\n", end='\n', generate=True),
    ],
)

models = [
    dict(
        abbr='abel-7b-002',
        type=HuggingFaceCausalLM,
        path='GAIR/Abel-7B-002',
        tokenizer_path='GAIR/Abel-7B-002',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
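For intuition, here is a minimal, self-contained sketch of how a round-based meta template of this shape turns a question into a prompt. It is illustrative only: render_prompt and the sample dialogue are hypothetical, not OpenCompass's actual prompt builder.

# Illustrative sketch only -- render_prompt is hypothetical, not the
# OpenCompass implementation. Each round contributes begin + content + end;
# the role marked generate=True contributes just its begin, leaving the
# completion to the model.
def render_prompt(meta_template, dialogue):
    parts = [meta_template.get('begin', '')]
    specs = {r['role']: r for r in meta_template['round']}
    for role, content in dialogue:
        spec = specs[role]
        if spec.get('generate'):
            parts.append(spec.get('begin', ''))
        else:
            parts.append(spec.get('begin', '') + content + spec.get('end', ''))
    return ''.join(parts)

print(render_prompt(_meta_template, [('HUMAN', '1 + 1 = ?'), ('BOT', '')]))
# Question:
# 1 + 1 = ?
# Answer: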
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    begin='',
    round=[
        dict(role="HUMAN", begin='Question: ', end='\n\n'),
        dict(role="BOT", begin="Answer: ", end='\n\n', generate=True),
    ],
)

models = [
    dict(
        abbr='arithmo-mistral-7b-hf',
        type=HuggingFaceCausalLM,
        path='akjindal53244/Arithmo-Mistral-7B',
        tokenizer_path='akjindal53244/Arithmo-Mistral-7B',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    begin='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n',
    round=[
        dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
        dict(role="BOT", begin="### Response:", end='\n\n', generate=True),
    ],
)

models = [
    dict(
        abbr='gsm8k-rft-llama7b2-u13b',
        type=HuggingFaceCausalLM,
        path='OFA-Sys/gsm8k-rft-llama7b2-u13b',
        tokenizer_path='OFA-Sys/gsm8k-rft-llama7b2-u13b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    begin="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
    round=[
        dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
        dict(role="BOT", begin="### Response: ", end='\n\n', generate=True),
    ],
)

models = [
    dict(
        abbr='metamath-7b-v1.0-hf',
        type=HuggingFaceCausalLM,
        path='meta-math/MetaMath-7B-V1.0',
        tokenizer_path='meta-math/MetaMath-7B-V1.0',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    begin="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
    round=[
        dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
        dict(role="BOT", begin="### Response: ", end='\n\n', generate=True),
    ],
)

models = [
    dict(
        abbr='metamath-llemma-7b-hf',
        type=HuggingFaceCausalLM,
        path='meta-math/MetaMath-Llemma-7B',
        tokenizer_path='meta-math/MetaMath-Llemma-7B',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    begin="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
    round=[
        dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
        dict(role="BOT", begin="### Response: ", end='\n\n', generate=True),
    ],
)

models = [
    dict(
        abbr='metamath-mistral-7b-hf',
        type=HuggingFaceCausalLM,
        path='meta-math/MetaMath-Mistral-7B',
        tokenizer_path='meta-math/MetaMath-Mistral-7B',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='phi-2-hf',
        path='microsoft/phi-2',
        tokenizer_path='microsoft/phi-2',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=100,
        min_out_len=3,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<_user>'),
        dict(role="BOT", begin="<_bot>", end='<_end>', generate=True),
    ],
    eos_token_id=160133
)

models = [
    dict(
        abbr='telechat-7b-hf',
        type=HuggingFaceCausalLM,
        path='Tele-AI/telechat-7B',
        tokenizer_path='Tele-AI/telechat-7B',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<_end>',
    )
]
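telechat-7B uses custom dialogue tokens, so the config also sets end_str so the runner can cut the completion at the model's own stop marker. A minimal sketch of that post-processing, assuming a simple string cut (the helper name is hypothetical):

# Hypothetical helper illustrating end_str handling: keep only the text
# before the first occurrence of the stop string.
def truncate_at_end_str(text: str, end_str: str) -> str:
    idx = text.find(end_str)
    return text if idx == -1 else text[:idx]

assert truncate_at_end_str('The answer is 42.<_end>...', '<_end>') == 'The answer is 42.'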
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        abbr='yayi2-30b-hf',
        type=HuggingFaceCausalLM,
        path='wenge-research/yayi2-30b',
        tokenizer_path='wenge-research/yayi2-30b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=100,
        min_out_len=3,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
]
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", end='\n\n'),
        dict(role="BOT", begin="### Response:", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardmath-7b-v1.0-hf',
        path='WizardLM/WizardMath-7B-V1.0',
        tokenizer_path='WizardLM/WizardMath-7B-V1.0',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    )
]
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", end='\n\n'),
        dict(role="BOT", begin="### Response:", end='</s>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardmath-7b-v1.1-hf',
        path='WizardLM/WizardMath-7B-V1.1',
        tokenizer_path='WizardLM/WizardMath-7B-V1.1',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    )
]
@@ -16,7 +16,7 @@ models = [
         meta_template=_meta_template,
         max_out_len=100,
         max_seq_len=2048,
-        batch_size=32,
+        batch_size=1,
         generation_kwargs=dict(temperature=0),
         end_str='</s>',
         run_cfg=dict(num_gpus=1, num_procs=1),
@@ -11,8 +11,8 @@ agent_summary_groups = [
     dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
     dict(
         name='agent',
-        subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10'],
-        weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1}
+        subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'],
+        weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}
     )
 ]
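The weights map above implies a weighted mean over subset scores (e.g., each executable metric counts twice as much as the matching numeric_correct). A minimal sketch of that aggregation, with made-up scores and a hypothetical function name, not the OpenCompass API:

# Hypothetical sketch of a weighted summary-group score.
def weighted_score(scores: dict, weights: dict) -> float:
    total = sum(weights.values())
    return sum(scores[name] * w for name, w in weights.items()) / total

scores = {'plugin_eval-p10': 60.0, 'plugin_eval-p10_zh': 40.0}
weights = {'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}
print(weighted_score(scores, weights))  # 50.0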
@@ -48,13 +48,26 @@ summarizer = dict(
         ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
         ['plugin_eval-p10-plan_str_v1', 'f1_score'],
         ['plugin_eval-p10-plan_json_v1', 'f1_score'],
-        ['plugin_eval-p10-reason_str_v2', 'thought'],
-        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
-        ['plugin_eval-p10-retrieve_str_v2', 'name'],
-        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
-        ['plugin_eval-p10-understand_str_v2', 'args'],
-        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
-        ['plugin_eval-p10-review_str_v6', 'review_quality'],
+        ['plugin_eval-p10-reason_str_v1', 'thought'],
+        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'thought'],
+        ['plugin_eval-p10-retrieve_str_v1', 'name'],
+        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'name'],
+        ['plugin_eval-p10-understand_str_v1', 'args'],
+        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'args'],
+        ['plugin_eval-p10-review_str_v1', 'review_quality'],
+        ['plugin_eval-p10_zh', 'naive_average'],
+        ['plugin_eval-p10-instruct_v1_zh', 'format_metric'],
+        ['plugin_eval-p10-instruct_v1_zh', 'args_em_metric'],
+        ['plugin_eval-p10-plan_str_v1_zh', 'f1_score'],
+        ['plugin_eval-p10-plan_json_v1_zh', 'f1_score'],
+        ['plugin_eval-p10-reason_str_v1_zh', 'thought'],
+        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
+        ['plugin_eval-p10-retrieve_str_v1_zh', 'name'],
+        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'name'],
+        ['plugin_eval-p10-understand_str_v1_zh', 'args'],
+        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'args'],
+        ['plugin_eval-p10-review_str_v1_zh', 'review_quality'],
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
leval_summary_groups = [
    {"name": "leval", "subsets": ["LEval_coursera", "LEval_gsm100", "LEval_quality", "LEval_tpo", "LEval_topic_retrieval", "LEval_financialqa", "LEval_gov_report_summ", "LEval_legal_contract_qa", "LEval_meeting_summ", "LEval_multidocqa", "LEval_narrativeqa", "LEval_nq", "LEval_news_summ", "LEval_paper_assistant", "LEval_patent_summ", "LEval_review_summ", "LEval_scientificqa", "LEval_tvshow_summ"]},
]

longbench_summary_groups = [
    {'name': 'longbench_single-document-qa', 'subsets': ['LongBench_narrativeqa', 'LongBench_qasper', 'LongBench_multifieldqa_en', 'LongBench_multifieldqa_zh']},
    {'name': 'longbench_multi-document-qa', 'subsets': ['LongBench_hotpotqa', 'LongBench_2wikimqa', 'LongBench_musique', 'LongBench_dureader']},
    {'name': 'longbench_summarization', 'subsets': ['LongBench_gov_report', 'LongBench_qmsum', 'LongBench_multi_news', 'LongBench_vcsum']},
    {'name': 'longbench_few-shot-learning', 'subsets': ['LongBench_trec', 'LongBench_triviaqa', 'LongBench_samsum', 'LongBench_lsht']},
    {'name': 'longbench_synthetic-tasks', 'subsets': ['LongBench_passage_count', 'LongBench_passage_retrieval_en', 'LongBench_passage_retrieval_zh']},
    {'name': 'longbench_code-completion', 'subsets': ['LongBench_lcc', 'LongBench_repobench-p']},
    {'name': 'longbench', 'subsets': ['longbench_single-document-qa', 'longbench_multi-document-qa', 'longbench_summarization', 'longbench_few-shot-learning', 'longbench_synthetic-tasks', 'longbench_code-completion']},
]
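Note that the 'longbench' group's subsets are themselves group names, so groups nest. A sketch of how such nesting could be resolved down to leaf datasets (expand is a hypothetical helper, not the OpenCompass summarizer):

# Hypothetical resolver for nested summary groups: expand group names
# recursively until only leaf dataset names remain.
def expand(name, groups):
    members = groups.get(name)
    if members is None:  # not a group -> a leaf dataset
        return [name]
    return [leaf for m in members for leaf in expand(m, groups)]

groups = {g['name']: g['subsets'] for g in longbench_summary_groups}
print(len(expand('longbench', groups)))  # 21 leaf datasets across 6 categories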
@@ -66,9 +66,9 @@ naive_mathbench_summary_groups = [
     {
         'name': 'mathbench-circular-and-cloze',
         'subsets': [
-            'mathbench-college-circular',
-            'mathbench-high-circular',
-            'mathbench-middle-circular',
+            'mathbench-circular',
             'mathbench-college-cloze_en',
             'mathbench-primary-cloze_cn',
         ],
@@ -65,9 +65,9 @@ mathbench_agent_summary_groups = [
     {
         'name': 'mathbench-circular-and-cloze-agent',
         'subsets': [
-            'mathbench-college-circular-agent',
-            'mathbench-high-circular-agent',
-            'mathbench-middle-circular-agent',
+            'mathbench-circular-agent',
             'mathbench-college-cloze_en-agent',
             'mathbench-primary-cloze_cn-agent',
         ],
-plugineval_summary_groups = [
+from copy import deepcopy
+
+_base_summary_groups = [
     {
         'name': 'plugin_eval-instruct_v1',
         'metric': 'format_metric',
@@ -22,47 +24,41 @@ plugineval_summary_groups = [
         ['plugin_eval-instruct_v1', 'args_em_metric'],
         ['plugin_eval-plan_str_v1', 'f1_score'],
         ['plugin_eval-plan_json_v1', 'f1_score'],
-        ['plugin_eval-reason_str_v2', 'thought'],
-        ['plugin_eval-reason_retrieve_understand_json_v2', 'thought'],
-        ['plugin_eval-retrieve_str_v2', 'name'],
-        ['plugin_eval-reason_retrieve_understand_json_v2', 'name'],
-        ['plugin_eval-understand_str_v2', 'args'],
-        ['plugin_eval-reason_retrieve_understand_json_v2', 'args'],
-        ['plugin_eval-review_str_v6', 'review_quality'],
+        ['plugin_eval-reason_str_v1', 'thought'],
+        ['plugin_eval-reason_retrieve_understand_json_v1', 'thought'],
+        ['plugin_eval-retrieve_str_v1', 'name'],
+        ['plugin_eval-reason_retrieve_understand_json_v1', 'name'],
+        ['plugin_eval-understand_str_v1', 'args'],
+        ['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
+        ['plugin_eval-review_str_v1', 'review_quality'],
     ]
 },
-# special treatment for first 10% data points
-{
-    'name': 'plugin_eval-p10-instruct_v1',
-    'metric': 'format_metric',
-    'subsets': [
-        ['plugin_eval-p10-instruct_v1', 'string_format_metric'],
-        ['plugin_eval-p10-instruct_v1', 'json_format_metric'],
-    ]
-},
-{
-    'name': 'plugin_eval-p10-instruct_v1',
-    'metric': 'args_em_metric',
-    'subsets': [
-        ['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
-        ['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
-    ]
-},
-{
-    'name': 'plugin_eval-p10',
-    'subsets': [
-        ['plugin_eval-p10-instruct_v1', 'format_metric'],
-        ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
-        ['plugin_eval-p10-plan_str_v1', 'f1_score'],
-        ['plugin_eval-p10-plan_json_v1', 'f1_score'],
-        ['plugin_eval-p10-reason_str_v2', 'thought'],
-        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
-        ['plugin_eval-p10-retrieve_str_v2', 'name'],
-        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
-        ['plugin_eval-p10-understand_str_v2', 'args'],
-        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
-        ['plugin_eval-p10-review_str_v6', 'review_quality'],
-    ]
-},
 ]
+
+plugineval_summary_groups = []
+# base
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    plugineval_summary_groups.append(group)
+# base _zh
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    group['name'] = group['name'] + '_zh'
+    group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
+    plugineval_summary_groups.append(group)
+# base -p10-
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-p10')
+    group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-p10'), subset[1]] for subset in group['subsets']]
+    plugineval_summary_groups.append(group)
+# base -p10- _zh
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-p10') + '_zh'
+    group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-p10') + '_zh', subset[1]] for subset in group['subsets']]
+    plugineval_summary_groups.append(group)
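To see what the four loops produce, here is a self-contained reduction with a single base group (the group name is shortened for illustration); it prints the base, _zh, -p10-, and -p10- _zh variants in the same order as above:

# Self-contained reduction of the variant-generation pattern above.
from copy import deepcopy

_base = [{'name': 'plugin_eval-plan_str_v1',
          'subsets': [['plugin_eval-plan_str_v1', 'f1_score']]}]

groups = []
for p10 in (False, True):
    for zh in (False, True):
        for g in _base:
            g = deepcopy(g)
            if p10:
                g['name'] = g['name'].replace('plugin_eval', 'plugin_eval-p10')
                g['subsets'] = [[s.replace('plugin_eval', 'plugin_eval-p10'), m]
                                for s, m in g['subsets']]
            if zh:
                g['name'] += '_zh'
                g['subsets'] = [[s + '_zh', m] for s, m in g['subsets']]
            groups.append(g)

print([g['name'] for g in groups])
# ['plugin_eval-plan_str_v1', 'plugin_eval-plan_str_v1_zh',
#  'plugin_eval-p10-plan_str_v1', 'plugin_eval-p10-plan_str_v1_zh']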
from mmengine.config import read_base

with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.bbh import bbh_summary_groups

summarizer = dict(
    dataset_abbrs=[
        ['mmlu', 'naive_average'],
        ['agieval', 'naive_average'],
        ['bbh', 'naive_average'],
        ['gsm8k', 'accuracy'],
        ['math', 'accuracy'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
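The summary_groups line relies on a module-scope idiom: every variable whose name ends in _summary_groups (here, the three lists pulled in via read_base) is collected from locals() and concatenated. A tiny demonstration of the idiom, with placeholder names:

# Demonstration of the locals() aggregation idiom; foo/bar are placeholders.
foo_summary_groups = [{'name': 'foo'}]
bar_summary_groups = [{'name': 'bar'}]
merged = sum([v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in merged])  # ['foo', 'bar']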
 summarizer = dict(
     dataset_abbrs = [
         '--------- LEval Exact Match (Acc) ---------', # category
-        "LEval_coursera",
+        'LEval_coursera',
         'LEval_gsm100',
         'LEval_quality',
-        "LEval_tpo",
+        'LEval_tpo',
         'LEval_topic_retrieval',
         '--------- LEval Gen (ROUGE) ---------', # category
         'LEval_financialqa',
@@ -21,5 +21,5 @@ summarizer = dict(
         'LEval_scientificqa',
         'LEval_tvshow_summ'
     ],
-    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 )