"...composable_kernel_rocm.git" did not exist on "6e28a8ac64af00418b95c681645690cb16633ab9"
Unverified commit b03d5dc5, authored by Fengzhe Zhou, committed by GitHub

[Sync] Sync Internal (#941)

parent bbec7d87
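# Dataset config: NQ-open k-shot generation, chat-format (HUMAN/BOT) prompt templates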
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever, RandomRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NQOpenDataset, NQEvaluator

nq_datasets = []
for k in [1]:
    nq_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        nq_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A:'),
                    ]
                )
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        nq_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A: {answer}.\n'),
                    ]
                ),
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin="</E>",
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A:'),
                    ]
                ),
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT")

    nq_datasets.append(
        dict(
            type=NQOpenDataset,
            abbr=f'nq_open_{k}shot',
            path='./data/nq-open/',
            reader_cfg=nq_reader_cfg,
            infer_cfg=nq_infer_cfg,
            eval_cfg=nq_eval_cfg)
    )
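# Dataset config: NQ-open k-shot generation, plain-string prompt templates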
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NQOpenDataset, NQEvaluator

nq_datasets = []
for k in [1]:
    nq_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        nq_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: ',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        nq_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: {answer}.\n',
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template='</E>Q: {question}\nA: ',
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT")

    nq_datasets.append(
        dict(
            type=NQOpenDataset,
            abbr=f'nq_open_{k}shot',
            path='./data/nq-open/',
            reader_cfg=nq_reader_cfg,
            infer_cfg=nq_infer_cfg,
            eval_cfg=nq_eval_cfg)
    )
@@ -11,19 +11,12 @@ race_reader_cfg = dict(
     test_split="test"
 )
 
+hint = "Read the article, and answer the question by replying A, B, C or D."
+question_and_options = "{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
 race_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
-        template={
-            'A':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: A',
-            'B':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: B',
-            'C':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: C',
-            'D':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: D',
-        }),
+        template={answer: hint + '\n\n' + question_and_options + '\n\nAnswer: ' + answer for answer in ['A', 'B', 'C', 'D']}),
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=PPLInferencer))
......
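# Dataset config: TriviaQA (wiki) k-shot generation, plain-string prompt templates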
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator

triviaqa_datasets = []
for k in [1]:
    triviaqa_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        triviaqa_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: ',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        triviaqa_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: {answer}.\n',
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template='</E>Q: {question}\nA: ',
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT")

    triviaqa_datasets.append(
        dict(
            type=TriviaQADataset_V2,
            abbr=f'triviaqa_wiki_{k}shot',
            path='./data/triviaqa',
            reader_cfg=triviaqa_reader_cfg,
            infer_cfg=triviaqa_infer_cfg,
            eval_cfg=triviaqa_eval_cfg)
    )
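# Dataset config: TriviaQA (wiki) k-shot generation, chat-format (HUMAN/BOT) prompt templates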
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator

triviaqa_datasets = []
for k in [1]:
    triviaqa_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        triviaqa_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A:'),
                    ]
                )
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        triviaqa_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A: {answer}.\n'),
                    ]
                ),
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin="</E>",
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A:'),
                    ]
                ),
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT")

    triviaqa_datasets.append(
        dict(
            type=TriviaQADataset_V2,
            abbr=f'triviaqa_wiki_{k}shot',
            path='./data/triviaqa',
            reader_cfg=triviaqa_reader_cfg,
            infer_cfg=triviaqa_infer_cfg,
            eval_cfg=triviaqa_eval_cfg)
    )
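# Dataset config: WinoGrande 5-shot generation with first-option answer postprocessing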
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V3
from opencompass.utils.text_postprocessors import first_option_postprocess

winogrande_reader_cfg = dict(
    input_columns=["opt1", "opt2"],
    output_column="answer",
    train_split="train_xs",
    test_split="dev",
)

winogrande_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin="</E>",
            round=[
                dict(role="HUMAN", prompt="Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}\nAnswer:"),
                dict(role="BOT", prompt="{answer}"),
            ]
        ),
        ice_token="</E>",
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer),
)

winogrande_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options="AB"),
)

winogrande_datasets = [
    dict(
        abbr="winogrande",
        type=winograndeDataset_V3,
        path="./data/winogrande",
        reader_cfg=winogrande_reader_cfg,
        infer_cfg=winogrande_infer_cfg,
        eval_cfg=winogrande_eval_cfg,
    )
]
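# Dataset config: WinoGrande 5-shot log-likelihood scoring via LLInferencer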
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import LLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V3

winogrande_reader_cfg = dict(
    input_columns=['opt1', 'opt2'],
    output_column='answer',
    train_split="train_xs",
    test_split="dev",
)

question_and_options = "Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}"

winogrande_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={answer: f"{question_and_options}\nAnswer: {answer}\n" for answer in ["A", "B"]},
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={answer: f"</E>{question_and_options}\nAnswer: {answer}" for answer in ["A", "B"]},
        ice_token="</E>",
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=LLInferencer),
)

winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

winogrande_datasets = [
    dict(
        abbr='winogrande',
        type=winograndeDataset_V3,
        path='./data/winogrande',
        reader_cfg=winogrande_reader_cfg,
        infer_cfg=winogrande_infer_cfg,
        eval_cfg=winogrande_eval_cfg)
]
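# Model config: google/gemma-2b (base) served through HuggingFaceCausalLM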
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-2b-hf',
        path="google/gemma-2b",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
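# Model config: google/gemma-2b-it with <start_of_turn>/<end_of_turn> chat meta template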
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
        dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
    ],
    eos_token_id=151645,
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-2b-it-hf',
        path="google/gemma-2b-it",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
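# Model config: google/gemma-7b (base) served through HuggingFaceCausalLM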
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-7b-hf',
        path="google/gemma-7b",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
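# Model config: google/gemma-7b-it with <start_of_turn>/<end_of_turn> chat meta template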
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
        dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
    ],
    eos_token_id=151645,
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-7b-it-hf',
        path="google/gemma-7b-it",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        meta_template=_meta_template,
        min_out_len=1,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
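# Model config: openbmb/MiniCPM-2B-dpo-fp32 with <用户>/<AI> meta template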
from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<用户>'),
        dict(role="BOT", begin="<AI>", generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='minicpm-2b-dpo-hf',
        path='openbmb/MiniCPM-2B-dpo-fp32',
        tokenizer_path='openbmb/MiniCPM-2B-dpo-fp32',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<用户>',
    )
]
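# Model config: openbmb/MiniCPM-2B-sft-fp32 with <用户>/<AI> meta template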
from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<用户>'),
        dict(role="BOT", begin="<AI>", generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='minicpm-2b-sft-hf',
        path='openbmb/MiniCPM-2B-sft-fp32',
        tokenizer_path='openbmb/MiniCPM-2B-sft-fp32',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<用户>',
    )
]
@@ -20,6 +20,6 @@ models = [
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=2, num_procs=1),
     )
 ]
@@ -4,8 +4,7 @@ from opencompass.models import VLLM
 _meta_template = dict(
     round=[
         dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
-        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n',
-             generate=True),
+        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
     ],
     eos_token_id=151645,
 )
......
@@ -5,101 +5,27 @@ with read_base():
    from .groups.plugineval import plugineval_summary_groups

agent_summary_groups = [
    dict(name='math_acc_1_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    dict(name='math_perf_4_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    dict(name='math_acc_1_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    dict(
        name='agent',
        subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'],
        weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}
    )
    # dict(name='math_acc_1_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    # dict(name='math_perf_4_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    # dict(name='math_acc_1_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    # dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    # dict(name='agent', subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'], weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}),
    dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']),
    dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']),
    dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']),
    dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']),
    dict(name='agent', subsets=['agent_cn', 'agent_en']),
]

summarizer = dict(
    dataset_abbrs=[
        # 'agent',
        # 'math_acc_1_and_fill_in_blank-native',
        # 'math_perf_4_and_fill_in_blank-native',
        # # '######## MathBench-Agent Accuracy ########', # category
        # 'math_acc_1_and_fill_in_blank-agent',
        # 'math_perf_4_and_fill_in_blank-agent',
        # # '######## CIBench Template ########', # category
        # 'cibench_template:executable',
        # 'cibench_template:numeric_correct',
        # 'cibench_template:text_score',
        # 'cibench_template:vis_sim',
        # # '######## CIBench Template Chinese ########', # category
        # 'cibench_template_cn:executable',
        # 'cibench_template_cn:numeric_correct',
        # 'cibench_template_cn:text_score',
        # 'cibench_template_cn:vis_sim',
        # # '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
        # 'cibench_template_wo_nltk:executable',
        # 'cibench_template_wo_nltk:numeric_correct',
        # 'cibench_template_wo_nltk:vis_sim',
        # # '######## CIBench Template Chinese w/o NLTK ########', # category
        # 'cibench_template_cn_wo_nltk:executable',
        # 'cibench_template_cn_wo_nltk:numeric_correct',
        # 'cibench_template_cn_wo_nltk:vis_sim',
        # '######## T-Eval ########', # category
        ['plugin_eval-p10', 'naive_average'],
        ['plugin_eval-p10-instruct_v1', 'format_metric'],
        ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
        ['plugin_eval-p10-plan_str_v1', 'f1_score'],
        ['plugin_eval-p10-plan_json_v1', 'f1_score'],
        ['plugin_eval-p10-reason_str_v1', 'thought'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'thought'],
        ['plugin_eval-p10-retrieve_str_v1', 'name'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'name'],
        ['plugin_eval-p10-understand_str_v1', 'args'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'args'],
        ['plugin_eval-p10-review_str_v1', 'review_quality'],
        ['plugin_eval-p10_zh', 'naive_average'],
        ['plugin_eval-p10-instruct_v1_zh', 'format_metric'],
        ['plugin_eval-p10-instruct_v1_zh', 'args_em_metric'],
        ['plugin_eval-p10-plan_str_v1_zh', 'f1_score'],
        ['plugin_eval-p10-plan_json_v1_zh', 'f1_score'],
        ['plugin_eval-p10-reason_str_v1_zh', 'thought'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
        ['plugin_eval-p10-retrieve_str_v1_zh', 'name'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'name'],
        ['plugin_eval-p10-understand_str_v1_zh', 'args'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'args'],
        ['plugin_eval-p10-review_str_v1_zh', 'review_quality'],
        # '######## MUS-T-Eval ########', # category
        ['plugin_eval-mus-p10', 'naive_average'],
        ['plugin_eval-mus-p10-instruct_v1', 'format_metric'],
        ['plugin_eval-mus-p10-instruct_v1', 'args_em_metric'],
        ['plugin_eval-mus-p10-plan_str_v1', 'f1_score'],
        ['plugin_eval-mus-p10-plan_json_v1', 'f1_score'],
        ['plugin_eval-mus-p10-reason_str_v1', 'thought'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'thought'],
        ['plugin_eval-mus-p10-retrieve_str_v1', 'name'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'name'],
        ['plugin_eval-mus-p10-understand_str_v1', 'args'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'args'],
        ['plugin_eval-mus-p10-review_str_v1', 'review_quality'],
        ['plugin_eval-mus-p10_zh', 'naive_average'],
        ['plugin_eval-mus-p10-instruct_v1_zh', 'format_metric'],
        ['plugin_eval-mus-p10-instruct_v1_zh', 'args_em_metric'],
        ['plugin_eval-mus-p10-plan_str_v1_zh', 'f1_score'],
        ['plugin_eval-mus-p10-plan_json_v1_zh', 'f1_score'],
        ['plugin_eval-mus-p10-reason_str_v1_zh', 'thought'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
        ['plugin_eval-mus-p10-retrieve_str_v1_zh', 'name'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'name'],
        ['plugin_eval-mus-p10-understand_str_v1_zh', 'args'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'args'],
        ['plugin_eval-mus-p10-review_str_v1_zh', 'review_quality'],
        # ['plugin_eval-p10', 'naive_average'],
        # ['plugin_eval-mus-p10', 'naive_average'],
        # ['plugin_eval-p10_zh', 'naive_average'],
        # ['plugin_eval-mus-p10_zh', 'naive_average'],
        'agent',
        'agent_cn',
        'agent_en',
        'cibench_template_cn',
        'cibench_template',
        'plugin_eval-mus-p10_one_review_zh',
        'plugin_eval-mus-p10_one_review',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
......
@@ -21,30 +21,22 @@ code_passk_summary_groups = [
     {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]},
     # real add
     {'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']},
-    {'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']}
+    # {'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']}
+    {'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)']},
+    {'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
+    {'name': 'code', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
 ]
 
 summarizer = dict(
     dataset_abbrs=[
         'code',
-        'humaneval_pass@1(greedy)',
-        'humaneval_pass@10',
+        'code_cn',
+        'code_en',
         'humaneval_cn_pass@1(greedy)',
-        'humaneval_cn_pass@10',
         'humaneval_plus_pass@1(greedy)',
-        'humaneval_plus_pass@10',
-        'mbpp_pass@1(greedy)',
-        'mbpp_pass@10',
         'mbpp_cn_pass@1(greedy)',
-        'mbpp_cn_pass@10',
         'sanitized_mbpp_pass@1(greedy)',
-        'sanitized_mbpp_pass@10',
         'humanevalx',
-        'humanevalx-python',
-        'humanevalx-cpp',
-        'humanevalx-go',
-        'humanevalx-java',
-        'humanevalx-js',
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
......
@@ -15,21 +15,13 @@ compassbench_v1_knowledge_groups = [
        'compassbench_v1_knowledge-mixed-cloze_en'

summarizer = dict(
    dataset_abbrs=[
        'knowledge_acc_1_and_cloze',
        ['knowledge_cn', 'acc_1'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],
        'compassbench_v1_knowledge-mixed-cloze_en',
        'knowledge_perf_4_and_cloze',
        ['knowledge_cn', 'perf_4'],
        'compassbench_v1_knowledge-mixed-cloze_en',
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
        'compassbench_v1_knowledge-mixed-cloze_en',
    ],
    summary_groups=compassbench_v1_knowledge_groups
)

# This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen`
compassbench_v1_math_groups = [
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'acc_1'], ['compassbench_v1_math-high-single_choice_en', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
    {'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy']]},
    {'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
]

summarizer = dict(
    dataset_abbrs=[
        'math_acc_1_and_fill_in_blank',
        ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
        'math_perf_4_and_fill_in_blank',
        'math_perf_4_and_fill_in_blank_cn',
        'math_perf_4_and_fill_in_blank_en',
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
......
@@ -34,37 +34,18 @@ compassbench_v1_language_groups = [
summarizer = dict(
    dataset_abbrs=[
        'language_acc_1_and_non_mcq',
        'language_en_acc_1_and_non_mcq',
        'language_zh_acc_1_and_non_mcq',
        # ['information_retrieval_en', 'score'],
        # ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'acc_origin'],
        ['intention_recognition_zh_circular', 'acc_origin'],
        ['sentiment_analysis_en_circular', 'acc_origin'],
        ['sentiment_analysis_zh_circular', 'acc_origin'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'acc_origin'],
        ['content_critic_zh_circular', 'acc_origin'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
        ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
        'language_perf_4_and_non_mcq',
        'language_en_perf_4_and_non_mcq',
        'language_zh_perf_4_and_non_mcq',
        # ['information_retrieval_en', 'score'],
        # ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'perf_circular'],
        'language_en_perf_4_and_non_mcq',
        ['intention_recognition_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['intention_recognition_en_circular', 'perf_circular'],
        ['sentiment_analysis_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_critic_zh_circular', 'perf_circular'],
        ['content_summarization_en', 'rouge1'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_summarization_zh', 'rouge1'],
        ['content_summarization_en', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
        ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
    ],
......