"configs/vscode:/vscode.git/clone" did not exist on "c1f6bbab5582957d8ddfb487f9df02e81560e93e"
Unverified Commit 32f40a8f authored by Fengzhe Zhou's avatar Fengzhe Zhou Committed by GitHub
Browse files

[Sync] Sync with internal codes 2023.01.08 (#777)

parent 8194199d
from opencompass.models import HuggingFaceCausalLM

# Vicuna-style chat template ("USER: <msg> ASSISTANT: <reply></s>") used by
# WizardLM v1.0 models.
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='USER: ', end=' '),
        dict(role='BOT', begin='ASSISTANT: ', end='</s>', generate=True),
    ],
)

# WizardLM-70B-V1.0 evaluated through the HuggingFace causal-LM backend,
# sharded over 4 GPUs.
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardlm-70b-v1.0-hf',
        path='WizardLM/WizardLM-70B-V1.0',
        tokenizer_path='WizardLM/WizardLM-70B-V1.0',
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        # Left-side padding/truncation keeps the prompt tail intact for generation.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
        # Stop decoding at the EOS marker produced by the template.
        end_str='</s>',
    ),
]
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style template: the user turn is followed by a blank line, then
# "### Response:" opens the model's turn; decoding stops at '</s>'.
_meta_template = dict(
    round=[
        dict(role='HUMAN', end='\n\n'),
        dict(role='BOT', begin='### Response:', end='</s>', generate=True),
    ],
)

# WizardLM-7B-V1.0 via the HuggingFace causal-LM backend on a single GPU.
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardlm-7b-v1.0-hf',
        path='WizardLM/WizardLM-7B-V1.0',
        tokenizer_path='WizardLM/WizardLM-7B-V1.0',
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        # Left padding/truncation preserves the end of long prompts.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    ),
]
from opencompass.models import VLLM

# Vicuna-style chat template ("USER: <msg> ASSISTANT: <reply></s>").
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='USER: ', end=' '),
        dict(role='BOT', begin='ASSISTANT: ', end='</s>', generate=True),
    ],
)

# WizardLM-13B-V1.2 served through vLLM; temperature 0 gives greedy decoding.
models = [
    dict(
        type=VLLM,
        abbr='wizardlm-13b-v1.2-vllm',
        path='WizardLM/WizardLM-13B-V1.2',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
from opencompass.models import VLLM

# Vicuna-style chat template ("USER: <msg> ASSISTANT: <reply></s>").
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='USER: ', end=' '),
        dict(role='BOT', begin='ASSISTANT: ', end='</s>', generate=True),
    ],
)

# WizardLM-70B-V1.0 served through vLLM with tensor parallelism across 4 GPUs;
# temperature 0 gives greedy decoding.
models = [
    dict(
        type=VLLM,
        abbr='wizardlm-70b-v1.0-vllm',
        path='WizardLM/WizardLM-70B-V1.0',
        model_kwargs=dict(tensor_parallel_size=4),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=4, num_procs=1),
    ),
]
from opencompass.models import VLLM

# Alpaca-style template: blank line after the user turn, "### Response:"
# before the model's turn.
_meta_template = dict(
    round=[
        dict(role='HUMAN', end='\n\n'),
        dict(role='BOT', begin='### Response:', end='</s>', generate=True),
    ],
)

# WizardLM-7B-V1.0 served through vLLM; greedy decoding (temperature 0).
models = [
    dict(
        type=VLLM,
        abbr='wizardlm-7b-v1.0-vllm',
        path='WizardLM/WizardLM-7B-V1.0',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
from opencompass.models import HuggingFace

# 01-ai Yi-34B-200K base model evaluated with the generic HuggingFace backend.
# NOTE(review): this span was corrupted by a bad diff merge (duplicate
# `type=`/`abbr=` keyword arguments — a SyntaxError — plus leftover
# wizardlm-7b-hf lines, a raw diff-hunk marker and two conflicting `run_cfg`
# entries). Reconstructed from the surviving "new" lines and the sibling Yi
# configs; confirm against the upstream file.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-34b-200k-hf',
        path='01-ai/Yi-34B-200K',
        tokenizer_path='01-ai/Yi-34B-200K',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            # assumed from the sibling Yi configs — the diff cut this dict short
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        # 34B checkpoint needs 4 GPUs (the num_gpus=1 line was the removed
        # wizardlm-7b leftover).
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
]
from opencompass.models import HuggingFace

# ChatML-style template used by the Yi chat models
# (<|im_start|>role ... <|im_end|>).
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
             generate=True),
    ],
)

# Yi-34B-Chat on the generic HuggingFace backend, sharded over 4 GPUs.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-34b-chat-hf',
        path='01-ai/Yi-34B-Chat',
        tokenizer_path='01-ai/Yi-34B-Chat',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
        end_str='<|im_end|>',
    ),
]
from opencompass.models import HuggingFace

# NOTE(review): this is an Alpaca-style chat template ("### Response:")
# applied to the Yi-6B-200K *base* model — looks inherited from the WizardLM
# configs; confirm it is intentional.
_meta_template = dict(
    round=[
        dict(role='HUMAN', end='\n\n'),
        dict(role='BOT', begin='### Response:', end='</s>', generate=True),
    ],
)

# Yi-6B-200K on the generic HuggingFace backend, single GPU.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-6b-200k-hf',
        path='01-ai/Yi-6B-200K',
        tokenizer_path='01-ai/Yi-6B-200K',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    ),
]
from opencompass.models import HuggingFace

# ChatML-style template used by the Yi chat models
# (<|im_start|>role ... <|im_end|>).
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
             generate=True),
    ],
)

# Yi-6B-Chat on the generic HuggingFace backend, single GPU.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-6b-chat-hf',
        path='01-ai/Yi-6B-Chat',
        tokenizer_path='01-ai/Yi-6B-Chat',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
    ),
]
from opencompass.models import HuggingFace

# Zephyr chat template: <|user|> / <|assistant|> headers, turns closed by '</s>'.
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|user|>\n', end='</s>'),
        dict(role='BOT', begin='<|assistant|>\n', end='</s>', generate=True),
    ],
)

# HuggingFaceH4/zephyr-7b-beta on the generic HuggingFace backend, single GPU.
models = [
    dict(
        type=HuggingFace,
        abbr='zephyr-7b-beta-hf',
        path='HuggingFaceH4/zephyr-7b-beta',
        tokenizer_path='HuggingFaceH4/zephyr-7b-beta',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    ),
]
from opencompass.models import VLLM

# Zephyr chat template: <|user|> / <|assistant|> headers, turns closed by '</s>'.
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|user|>\n', end='</s>'),
        dict(role='BOT', begin='<|assistant|>\n', end='</s>', generate=True),
    ],
)

# zephyr-7b-beta served through vLLM; greedy decoding (temperature 0).
models = [
    dict(
        type=VLLM,
        abbr='zephyr-7b-beta-vllm',
        path='HuggingFaceH4/zephyr-7b-beta',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
from mmengine.config import read_base

with read_base():
    from .groups.cibench import cibench_summary_groups
    from .groups.plugineval import plugineval_summary_groups

# MathBench aggregates, built for both the tool-free ('native') and the
# tool-augmented ('agent') runs: the high/middle single-choice sets contribute
# acc_1 or perf_4, the primary-school cloze sets always contribute accuracy.
agent_summary_groups = []
for _mode in ('native', 'agent'):
    for _group, _metric in (('math_acc_1_and_fill_in_blank', 'acc_1'),
                            ('math_perf_4_and_fill_in_blank', 'perf_4')):
        agent_summary_groups.append(dict(
            name=f'{_group}-{_mode}',
            subsets=[[f'compassbench_v1_math-{_lvl}-single_choice_{_lang}-{_mode}', _metric]
                     for _lvl in ('high', 'middle') for _lang in ('cn', 'en')]
                    + [[f'compassbench_v1_math-primary-cloze_{_lang}-{_mode}', 'accuracy']
                       for _lang in ('cn', 'en')],
        ))
# Weighted overall 'agent' score across MathBench-agent, CIBench (w/o NLTK,
# en + cn) and T-Eval p10.
agent_summary_groups.append(dict(
    name='agent',
    subsets=['math_perf_4_and_fill_in_blank-agent',
             'cibench_template_wo_nltk:executable',
             'cibench_template_wo_nltk:numeric_correct',
             'cibench_template_wo_nltk:vis_sim',
             'cibench_template_cn_wo_nltk:executable',
             'cibench_template_cn_wo_nltk:numeric_correct',
             'cibench_template_cn_wo_nltk:vis_sim',
             'plugin_eval-p10'],
    weights={'math_perf_4_and_fill_in_blank-agent': 1,
             'cibench_template_wo_nltk:executable': 0.5,
             'cibench_template_wo_nltk:numeric_correct': 0.25,
             'cibench_template_wo_nltk:vis_sim': 0.25,
             'cibench_template_cn_wo_nltk:executable': 0.5,
             'cibench_template_cn_wo_nltk:numeric_correct': 0.25,
             'cibench_template_cn_wo_nltk:vis_sim': 0.25,
             'plugin_eval-p10': 1},
))

summarizer = dict(
    dataset_abbrs=[
        'agent',
        'math_acc_1_and_fill_in_blank-native',
        'math_perf_4_and_fill_in_blank-native',
        # '######## MathBench-Agent Accuracy ########', # category
        'math_acc_1_and_fill_in_blank-agent',
        'math_perf_4_and_fill_in_blank-agent',
        # '######## CIBench Template ########', # category
        'cibench_template:executable',
        'cibench_template:numeric_correct',
        'cibench_template:text_score',
        'cibench_template:vis_sim',
        # '######## CIBench Template Chinese ########', # category
        'cibench_template_cn:executable',
        'cibench_template_cn:numeric_correct',
        'cibench_template_cn:text_score',
        'cibench_template_cn:vis_sim',
        # '######## CIBench Template w/o NLTK ########', # category no text score because it is only for nltk
        'cibench_template_wo_nltk:executable',
        'cibench_template_wo_nltk:numeric_correct',
        'cibench_template_wo_nltk:vis_sim',
        # '######## CIBench Template Chinese w/o NLTK ########', # category
        'cibench_template_cn_wo_nltk:executable',
        'cibench_template_cn_wo_nltk:numeric_correct',
        'cibench_template_cn_wo_nltk:vis_sim',
        # '######## T-Eval ########', # category
        ['plugin_eval-p10', 'naive_average'],
        ['plugin_eval-p10-instruct_v1', 'format_metric'],
        ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
        ['plugin_eval-p10-plan_str_v1', 'f1_score'],
        ['plugin_eval-p10-plan_json_v1', 'f1_score'],
        ['plugin_eval-p10-reason_str_v2', 'thought'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
        ['plugin_eval-p10-retrieve_str_v2', 'name'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
        ['plugin_eval-p10-understand_str_v2', 'args'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
        ['plugin_eval-p10-review_str_v6', 'review_quality'],
    ],
    # Collect every *_summary_groups variable defined/imported above.
    summary_groups=sum(
        (v for k, v in locals().items() if k.endswith('_summary_groups')), []),
)
from mmengine.config import read_base

with read_base():
    from .groups.cibench import cibench_summary_groups

# Metric columns reported per CIBench flavour; the w/o-NLTK variants drop
# text_score because only the nltk subset produces it.
_full_metrics = ('executable', 'numeric_correct', 'text_score', 'vis_sim')
_wo_nltk_metrics = ('executable', 'numeric_correct', 'vis_sim')

summarizer = dict(
    dataset_abbrs=[
        '######## CIBench Generation ########', # category
        *[['cibench', _m] for _m in ('executable', 'general_correct', 'vis_sim')],
        '######## CIBench Template ########', # category
        *['cibench_template:' + _m for _m in _full_metrics],
        '######## CIBench Template Chinese ########', # category
        *['cibench_template_cn:' + _m for _m in _full_metrics],
        '######## CIBench Template w/o NLTK ########', # category no text score because it is only for nltk
        *['cibench_template_wo_nltk:' + _m for _m in _wo_nltk_metrics],
        '######## CIBench Template Chinese w/o NLTK ########', # category
        *['cibench_template_cn_wo_nltk:' + _m for _m in _wo_nltk_metrics],
    ],
    # Collect every *_summary_groups variable defined/imported above.
    summary_groups=sum(
        (v for k, v in locals().items() if k.endswith('_summary_groups')), []),
)
# (display name, dataset abbr, raw metric): greedy pass@1 and sampled pass@10
# renames for the HumanEval / MBPP family, covering both the `*_passk` and the
# `*_repeat10` run variants of each benchmark.
_passk_renames = [
    ('humaneval_pass@1(greedy)', 'openai_humaneval', 'humaneval_pass@1'),
    ('humaneval_pass@10', 'openai_humaneval_passk', 'humaneval_pass@10'),
    ('humaneval_pass@10', 'openai_humaneval_repeat10', 'humaneval_pass@10'),
    ('humaneval_cn_pass@1(greedy)', 'openai_humaneval_cn', 'humaneval_pass@1'),
    ('humaneval_cn_pass@10', 'openai_humaneval_cn_passk', 'humaneval_pass@10'),
    ('humaneval_cn_pass@10', 'openai_humaneval_cn_repeat10', 'humaneval_pass@10'),
    ('humaneval_plus_pass@1(greedy)', 'humaneval_plus', 'humaneval_plus_pass@1'),
    ('humaneval_plus_pass@10', 'humaneval_plus_passk', 'humaneval_plus_pass@10'),
    ('humaneval_plus_pass@10', 'humaneval_plus_repeat10', 'humaneval_plus_pass@10'),
    ('mbpp_pass@1(greedy)', 'mbpp', 'score'),
    ('mbpp_pass@10', 'mbpp_passk', 'pass@10'),
    ('mbpp_pass@10', 'mbpp_repeat10', 'pass@10'),
    ('mbpp_cn_pass@1(greedy)', 'mbpp_cn', 'score'),
    ('mbpp_cn_pass@10', 'mbpp_cn_passk', 'pass@10'),
    ('mbpp_cn_pass@10', 'mbpp_cn_repeat10', 'pass@10'),
    ('sanitized_mbpp_pass@1(greedy)', 'sanitized_mbpp', 'score'),
    ('sanitized_mbpp_pass@10', 'sanitized_mbpp_passk', 'pass@10'),
    ('sanitized_mbpp_pass@10', 'sanitized_mbpp_repeat10', 'pass@10'),
]
# rename
code_passk_summary_groups = [
    {'name': _name, 'subsets': [[_abbr, _metric]]}
    for _name, _abbr, _metric in _passk_renames
]
# real add: aggregates built on top of the renamed metrics
code_passk_summary_groups += [
    {'name': 'humanevalx',
     'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go',
                 'humanevalx-java', 'humanevalx-js']},
    {'name': 'code',
     'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)',
                 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)',
                 'humanevalx']},
]

summarizer = dict(
    dataset_abbrs=[
        'code',
        'humaneval_pass@1(greedy)',
        'humaneval_pass@10',
        'humaneval_cn_pass@1(greedy)',
        'humaneval_cn_pass@10',
        'humaneval_plus_pass@1(greedy)',
        'humaneval_plus_pass@10',
        'mbpp_pass@1(greedy)',
        'mbpp_pass@10',
        'mbpp_cn_pass@1(greedy)',
        'mbpp_cn_pass@10',
        'sanitized_mbpp_pass@1(greedy)',
        'sanitized_mbpp_pass@10',
        'humanevalx',
        'humanevalx-python',
        'humanevalx-cpp',
        'humanevalx-go',
        'humanevalx-java',
        'humanevalx-js',
    ],
    # Collect every *_summary_groups variable defined above.
    summary_groups=sum(
        (v for k, v in locals().items() if k.endswith('_summary_groups')), []),
)
# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
# Chinese circular single-choice knowledge subsets.
compassbench_v1_knowledge_names = [
    'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
    'compassbench_v1_knowledge-engineering-single_choice_cn_circular',
    'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
    'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
    'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
]

# Aggregates: the cn union, then its acc_1/perf_4 flavour paired with the
# English mixed-cloze score.
compassbench_v1_knowledge_groups = [
    {'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names},
    {'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
    {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]
# NOTE: a stray no-op string expression ('compassbench_v1_knowledge-mixed-cloze_en')
# left over from a bad merge was removed here.

summarizer = dict(
    dataset_abbrs=[
        'knowledge_acc_1_and_cloze',
        ['knowledge_cn', 'acc_1'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],
        'compassbench_v1_knowledge-mixed-cloze_en',
        'knowledge_perf_4_and_cloze',
        ['knowledge_cn', 'perf_4'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
        'compassbench_v1_knowledge-mixed-cloze_en',
    ],
    summary_groups=compassbench_v1_knowledge_groups
)
# This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen`
# Single-choice sets report acc_1/perf_4; primary-school cloze sets report
# plain accuracy in both metric flavours.
_acc_1_subsets = [
    ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
    ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
    ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
    ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
    ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
    ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
]
_perf_4_subsets = [
    ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
    ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
    ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
    ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
    ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
    ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
]

compassbench_v1_math_groups = [
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': _acc_1_subsets},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': _perf_4_subsets},
]

summarizer = dict(
    # Each aggregate name is followed by its constituent (dataset, metric) rows.
    dataset_abbrs=(['math_acc_1_and_fill_in_blank'] + _acc_1_subsets
                   + ['math_perf_4_and_fill_in_blank'] + _perf_4_subsets),
    summary_groups=compassbench_v1_math_groups,
)
# (dataset, metric) pairs scored by the language track; circular datasets list
# both their plain accuracy ('acc_origin') and circular-consistency
# ('perf_circular') metrics, the rest list a single metric.
compassbench_v1_language_names = [
    # ['information_retrieval_en', 'score'],
    # ['information_retrieval_zh', 'score'],
    ['intention_recognition_en_circular', 'acc_origin'],
    ['intention_recognition_en_circular', 'perf_circular'],
    ['intention_recognition_zh_circular', 'acc_origin'],
    ['intention_recognition_zh_circular', 'perf_circular'],
    ['sentiment_analysis_en_circular', 'acc_origin'],
    ['sentiment_analysis_en_circular', 'perf_circular'],
    ['sentiment_analysis_zh_circular', 'acc_origin'],
    ['sentiment_analysis_zh_circular', 'perf_circular'],
    ['translation', 'score'],
    ['content_critic_en_circular', 'acc_origin'],
    ['content_critic_en_circular', 'perf_circular'],
    ['content_critic_zh_circular', 'acc_origin'],
    ['content_critic_zh_circular', 'perf_circular'],
    ['content_summarization_en', 'rouge1'],
    ['content_summarization_zh', 'rouge1'],
    ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
    ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
    ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
    ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
]

# Per-language aggregates: filter the table by the '_zh'/'_en' tag in the
# dataset name and drop the metric flavour not being summarized.
compassbench_v1_language_groups = [
    {'name': 'language_zh_acc_1_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_zh' in _ds and _m != 'perf_circular']},
    {'name': 'language_en_acc_1_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_en' in _ds and _m != 'perf_circular']},
    {'name': 'language_acc_1_and_non_mcq',
     'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']},
    {'name': 'language_zh_perf_4_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_zh' in _ds and _m != 'acc_origin']},
    {'name': 'language_en_perf_4_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_en' in _ds and _m != 'acc_origin']},
    {'name': 'language_perf_4_and_non_mcq',
     'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']},
]

summarizer = dict(
    dataset_abbrs=[
        'language_acc_1_and_non_mcq',
        'language_en_acc_1_and_non_mcq',
        'language_zh_acc_1_and_non_mcq',
        ['information_retrieval_en', 'score'],
        ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'acc_origin'],
        ['intention_recognition_zh_circular', 'acc_origin'],
        ['sentiment_analysis_en_circular', 'acc_origin'],
        ['sentiment_analysis_zh_circular', 'acc_origin'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'acc_origin'],
        ['content_critic_zh_circular', 'acc_origin'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
        ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
        'language_perf_4_and_non_mcq',
        'language_en_perf_4_and_non_mcq',
        'language_zh_perf_4_and_non_mcq',
        ['information_retrieval_en', 'score'],
        ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'perf_circular'],
        ['intention_recognition_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['sentiment_analysis_zh_circular', 'perf_circular'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_critic_zh_circular', 'perf_circular'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
        ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
    ],
    summary_groups=compassbench_v1_language_groups,
)
# ReasonBench aggregates: per-language logic unions plus an overall
# 'reasonbench' roll-up over commonsense + logic.
compassbench_v1_reason_groups = [
    {'name': 'reasonbench_cn_logic_circular',
     'subsets': [
         'reasonbench_cn_abductive_alphanlg_translated_circular',
         'reasonbench_cn_deductive_bbh3obj_translated_circular',
         'reasonbench_cn_deductive_logiqa_zh_circular',
         'reasonbench_cn_inductive_deer_translated_circular',
         'reasonbench_cn_inductive_selfgenerated_circular',
     ]},
    {'name': 'reasonbench_en_logic_circular',
     'subsets': [
         'reasonbench_en_abductive_alphanlg_circular',
         'reasonbench_en_deductive_bbh7obj_circular',
         'reasonbench_en_deductive_logiqa_zh_translated_circular',
         'reasonbench_en_deductive_ocnli_translated_circular',
         'reasonbench_en_inductive_deer_circular',
         'reasonbench_en_inductive_selfgenerated_circular',
     ]},
    {'name': 'reasonbench',
     'subsets': [
         'reasonbench_cn_commonsense_circular',
         'reasonbench_cn_logic_circular',
         'reasonbench_en_commonsense_circular',
         'reasonbench_en_logic_circular',
     ]},
]

# Each abbr is reported twice — once per metric flavour — preserving the order
# of the original literal table.
_reason_abbrs = [
    'reasonbench',
    'reasonbench_cn_commonsense_circular',
    'reasonbench_en_commonsense_circular',
    'reasonbench_cn_logic_circular',
    'reasonbench_en_logic_circular',
    'reasonbench_cn_abductive_alphanlg_translated_circular',
    'reasonbench_cn_deductive_bbh3obj_translated_circular',
    'reasonbench_cn_deductive_logiqa_zh_circular',
    'reasonbench_cn_inductive_deer_translated_circular',
    'reasonbench_cn_inductive_selfgenerated_circular',
    'reasonbench_en_abductive_alphanlg_circular',
    'reasonbench_en_deductive_bbh7obj_circular',
    'reasonbench_en_deductive_logiqa_zh_translated_circular',
    'reasonbench_en_deductive_ocnli_translated_circular',
    'reasonbench_en_inductive_deer_circular',
    'reasonbench_en_inductive_selfgenerated_circular',
]

summarizer = dict(
    dataset_abbrs=[[_abbr, _metric]
                   for _metric in ('acc_origin', 'perf_circular')
                   for _abbr in _reason_abbrs],
    summary_groups=compassbench_v1_reason_groups,
)
# Library names covered by the CIBench generation benchmarks.
#
# BUG FIX: the original first built 'cibench_generation_<lib>' names, then
# re-prefixed that *already prefixed* list with 'cibench_' (producing bogus
# 'cibench_cibench_generation_<lib>' subsets) and also overwrote the first
# cibench_summary_groups assignment, silently discarding the generation group.
# Both groups are now built independently from the raw library names.
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_summary_groups = [
    # Per-library generation subsets, e.g. 'cibench_generation_Pandas'.
    {'name': 'cibench_generation',
     'subsets': ['cibench_generation_' + lib for lib in _cibench]},
    # Plain per-library subsets aggregated under 'cibench'.
    {'name': 'cibench', 'subsets': ['cibench_' + lib for lib in _cibench]},
]
_cibench_template = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
                     'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template = ['cibench_template/' + i for i in _cibench_template]
# number of total exec questions in this module
# (columns: executable, numeric_correct, text_score, vis_sim)
_cibench_template_weight = {
    'lightgbm': [30, 15, 0, 0],
    'matplotlib': [42, 0, 0, 36],
    'nltk': [70, 30, 20, 10],
    'opencv': [60, 10, 0, 40],
    'pandas': [60, 40, 0, 10],
    'pytorch': [28, 0, 0, 0],
    'scipy': [60, 40, 0, 0],
    'seaborn': [42, 0, 0, 35],
    'sklearn': [42, 6, 0, 18],
    'tensorflow': [36, 6, 0, 12],
}
# One weighted summary group per metric; the column index selects the matching
# question count from the weight table.
for _metric, _col in (('executable', 0), ('numeric_correct', 1),
                      ('text_score', 2), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template],
        'weights': {'cibench_template/' + k: v[_col]
                    for k, v in _cibench_template_weight.items()},
    })
## chinese
_cibench_template_cn = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
                        'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template_cn = ['cibench_template_chinese/' + i for i in _cibench_template_cn]
# Chinese mirror of the template groups; reuses the English weight table with
# the 'cibench_template_chinese/' prefix.
for _metric, _col in (('executable', 0), ('numeric_correct', 1),
                      ('text_score', 2), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template_cn:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template_cn],
        'weights': {'cibench_template_chinese/' + k: v[_col]
                    for k, v in _cibench_template_weight.items()},
    })
## add more without nltk
# Same groups with the nltk subset filtered out; text_score is skipped because
# only nltk contributes it.
for _metric, _col in (('executable', 0), ('numeric_correct', 1), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template_wo_nltk:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template if 'nltk' not in _s],
        'weights': {'cibench_template/' + k: v[_col]
                    for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    })
# Chinese mirror of the w/o-NLTK groups.
for _metric, _col in (('executable', 0), ('numeric_correct', 1), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template_cn_wo_nltk:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template_cn if 'nltk' not in _s],
        'weights': {'cibench_template_chinese/' + k: v[_col]
                    for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    })
......@@ -31,4 +31,38 @@ plugineval_summary_groups = [
['plugin_eval-review_str_v6', 'review_quality'],
]
},
# special treatment for first 10% data points
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'format_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_format_metric'],
['plugin_eval-p10-instruct_v1', 'json_format_metric'],
]
},
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'args_em_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'plugin_eval-p10',
'subsets': [
['plugin_eval-p10-instruct_v1', 'format_metric'],
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
]
},
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment