"configs/vscode:/vscode.git/clone" did not exist on "c1f6bbab5582957d8ddfb487f9df02e81560e93e"
Unverified Commit 32f40a8f authored by Fengzhe Zhou's avatar Fengzhe Zhou Committed by GitHub
Browse files

[Sync] Sync with internal codes 2023.01.08 (#777)

parent 8194199d
from opencompass.models import HuggingFaceCausalLM

# Vicuna-style chat template ("USER: <msg> ASSISTANT: <reply></s>") used by
# WizardLM v1.0 models.
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='USER: ', end=' '),
        dict(role='BOT', begin='ASSISTANT: ', end='</s>', generate=True),
    ],
)

# WizardLM-70B-V1.0 evaluated through the HuggingFace causal-LM backend,
# sharded over 4 GPUs.
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardlm-70b-v1.0-hf',
        path='WizardLM/WizardLM-70B-V1.0',
        tokenizer_path='WizardLM/WizardLM-70B-V1.0',
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        # Left-side padding/truncation keeps the prompt tail intact for generation.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
        # Stop decoding at the EOS marker produced by the template.
        end_str='</s>',
    ),
]
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style template: the user turn is followed by a blank line, then
# "### Response:" opens the model's turn; decoding stops at '</s>'.
_meta_template = dict(
    round=[
        dict(role='HUMAN', end='\n\n'),
        dict(role='BOT', begin='### Response:', end='</s>', generate=True),
    ],
)

# WizardLM-7B-V1.0 via the HuggingFace causal-LM backend on a single GPU.
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='wizardlm-7b-v1.0-hf',
        path='WizardLM/WizardLM-7B-V1.0',
        tokenizer_path='WizardLM/WizardLM-7B-V1.0',
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        # Left padding/truncation preserves the end of long prompts.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    ),
]
from opencompass.models import VLLM

# Vicuna-style chat template ("USER: <msg> ASSISTANT: <reply></s>").
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='USER: ', end=' '),
        dict(role='BOT', begin='ASSISTANT: ', end='</s>', generate=True),
    ],
)

# WizardLM-13B-V1.2 served through vLLM; temperature 0 gives greedy decoding.
models = [
    dict(
        type=VLLM,
        abbr='wizardlm-13b-v1.2-vllm',
        path='WizardLM/WizardLM-13B-V1.2',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
from opencompass.models import VLLM

# Vicuna-style chat template ("USER: <msg> ASSISTANT: <reply></s>").
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='USER: ', end=' '),
        dict(role='BOT', begin='ASSISTANT: ', end='</s>', generate=True),
    ],
)

# WizardLM-70B-V1.0 served through vLLM with tensor parallelism across 4 GPUs;
# temperature 0 gives greedy decoding.
models = [
    dict(
        type=VLLM,
        abbr='wizardlm-70b-v1.0-vllm',
        path='WizardLM/WizardLM-70B-V1.0',
        model_kwargs=dict(tensor_parallel_size=4),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=4, num_procs=1),
    ),
]
from opencompass.models import VLLM

# Alpaca-style template: blank line after the user turn, "### Response:"
# before the model's turn.
_meta_template = dict(
    round=[
        dict(role='HUMAN', end='\n\n'),
        dict(role='BOT', begin='### Response:', end='</s>', generate=True),
    ],
)

# WizardLM-7B-V1.0 served through vLLM; greedy decoding (temperature 0).
models = [
    dict(
        type=VLLM,
        abbr='wizardlm-7b-v1.0-vllm',
        path='WizardLM/WizardLM-7B-V1.0',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
from opencompass.models import HuggingFace

# 01-ai Yi-34B-200K base model evaluated with the generic HuggingFace backend.
# NOTE(review): this span was corrupted by a bad diff merge (duplicate
# `type=`/`abbr=` keyword arguments — a SyntaxError — plus leftover
# wizardlm-7b-hf lines, a raw diff-hunk marker and two conflicting `run_cfg`
# entries). Reconstructed from the surviving "new" lines and the sibling Yi
# configs; confirm against the upstream file.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-34b-200k-hf',
        path='01-ai/Yi-34B-200K',
        tokenizer_path='01-ai/Yi-34B-200K',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            # assumed from the sibling Yi configs — the diff cut this dict short
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        # 34B checkpoint needs 4 GPUs (the num_gpus=1 line was the removed
        # wizardlm-7b leftover).
        run_cfg=dict(num_gpus=4, num_procs=1),
    )
]
from opencompass.models import HuggingFace

# ChatML-style template used by the Yi chat models
# (<|im_start|>role ... <|im_end|>).
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
             generate=True),
    ],
)

# Yi-34B-Chat on the generic HuggingFace backend, sharded over 4 GPUs.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-34b-chat-hf',
        path='01-ai/Yi-34B-Chat',
        tokenizer_path='01-ai/Yi-34B-Chat',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
        end_str='<|im_end|>',
    ),
]
from opencompass.models import HuggingFace

# NOTE(review): this is an Alpaca-style chat template ("### Response:")
# applied to the Yi-6B-200K *base* model — looks inherited from the WizardLM
# configs; confirm it is intentional.
_meta_template = dict(
    round=[
        dict(role='HUMAN', end='\n\n'),
        dict(role='BOT', begin='### Response:', end='</s>', generate=True),
    ],
)

# Yi-6B-200K on the generic HuggingFace backend, single GPU.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-6b-200k-hf',
        path='01-ai/Yi-6B-200K',
        tokenizer_path='01-ai/Yi-6B-200K',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    ),
]
from opencompass.models import HuggingFace

# ChatML-style template used by the Yi chat models
# (<|im_start|>role ... <|im_end|>).
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
             generate=True),
    ],
)

# Yi-6B-Chat on the generic HuggingFace backend, single GPU.
models = [
    dict(
        type=HuggingFace,
        abbr='yi-6b-chat-hf',
        path='01-ai/Yi-6B-Chat',
        tokenizer_path='01-ai/Yi-6B-Chat',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
    ),
]
from opencompass.models import HuggingFace

# Zephyr chat template: <|user|> / <|assistant|> headers, turns closed by '</s>'.
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|user|>\n', end='</s>'),
        dict(role='BOT', begin='<|assistant|>\n', end='</s>', generate=True),
    ],
)

# HuggingFaceH4/zephyr-7b-beta on the generic HuggingFace backend, single GPU.
models = [
    dict(
        type=HuggingFace,
        abbr='zephyr-7b-beta-hf',
        path='HuggingFaceH4/zephyr-7b-beta',
        tokenizer_path='HuggingFaceH4/zephyr-7b-beta',
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        # Left-side padding/truncation keeps the prompt tail intact.
        tokenizer_kwargs=dict(padding_side='left', truncation_side='left',
                              trust_remote_code=True),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='</s>',
    ),
]
from opencompass.models import VLLM

# Zephyr chat template: <|user|> / <|assistant|> headers, turns closed by '</s>'.
_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|user|>\n', end='</s>'),
        dict(role='BOT', begin='<|assistant|>\n', end='</s>', generate=True),
    ],
)

# zephyr-7b-beta served through vLLM; greedy decoding (temperature 0).
models = [
    dict(
        type=VLLM,
        abbr='zephyr-7b-beta-vllm',
        path='HuggingFaceH4/zephyr-7b-beta',
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        end_str='</s>',
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
from mmengine.config import read_base

with read_base():
    from .groups.cibench import cibench_summary_groups
    from .groups.plugineval import plugineval_summary_groups

# MathBench aggregates, built for both the tool-free ('native') and the
# tool-augmented ('agent') runs: the high/middle single-choice sets contribute
# acc_1 or perf_4, the primary-school cloze sets always contribute accuracy.
agent_summary_groups = []
for _mode in ('native', 'agent'):
    for _group, _metric in (('math_acc_1_and_fill_in_blank', 'acc_1'),
                            ('math_perf_4_and_fill_in_blank', 'perf_4')):
        agent_summary_groups.append(dict(
            name=f'{_group}-{_mode}',
            subsets=[[f'compassbench_v1_math-{_lvl}-single_choice_{_lang}-{_mode}', _metric]
                     for _lvl in ('high', 'middle') for _lang in ('cn', 'en')]
                    + [[f'compassbench_v1_math-primary-cloze_{_lang}-{_mode}', 'accuracy']
                       for _lang in ('cn', 'en')],
        ))
# Weighted overall 'agent' score across MathBench-agent, CIBench (w/o NLTK,
# en + cn) and T-Eval p10.
agent_summary_groups.append(dict(
    name='agent',
    subsets=['math_perf_4_and_fill_in_blank-agent',
             'cibench_template_wo_nltk:executable',
             'cibench_template_wo_nltk:numeric_correct',
             'cibench_template_wo_nltk:vis_sim',
             'cibench_template_cn_wo_nltk:executable',
             'cibench_template_cn_wo_nltk:numeric_correct',
             'cibench_template_cn_wo_nltk:vis_sim',
             'plugin_eval-p10'],
    weights={'math_perf_4_and_fill_in_blank-agent': 1,
             'cibench_template_wo_nltk:executable': 0.5,
             'cibench_template_wo_nltk:numeric_correct': 0.25,
             'cibench_template_wo_nltk:vis_sim': 0.25,
             'cibench_template_cn_wo_nltk:executable': 0.5,
             'cibench_template_cn_wo_nltk:numeric_correct': 0.25,
             'cibench_template_cn_wo_nltk:vis_sim': 0.25,
             'plugin_eval-p10': 1},
))

summarizer = dict(
    dataset_abbrs=[
        'agent',
        'math_acc_1_and_fill_in_blank-native',
        'math_perf_4_and_fill_in_blank-native',
        # '######## MathBench-Agent Accuracy ########', # category
        'math_acc_1_and_fill_in_blank-agent',
        'math_perf_4_and_fill_in_blank-agent',
        # '######## CIBench Template ########', # category
        'cibench_template:executable',
        'cibench_template:numeric_correct',
        'cibench_template:text_score',
        'cibench_template:vis_sim',
        # '######## CIBench Template Chinese ########', # category
        'cibench_template_cn:executable',
        'cibench_template_cn:numeric_correct',
        'cibench_template_cn:text_score',
        'cibench_template_cn:vis_sim',
        # '######## CIBench Template w/o NLTK ########', # category no text score because it is only for nltk
        'cibench_template_wo_nltk:executable',
        'cibench_template_wo_nltk:numeric_correct',
        'cibench_template_wo_nltk:vis_sim',
        # '######## CIBench Template Chinese w/o NLTK ########', # category
        'cibench_template_cn_wo_nltk:executable',
        'cibench_template_cn_wo_nltk:numeric_correct',
        'cibench_template_cn_wo_nltk:vis_sim',
        # '######## T-Eval ########', # category
        ['plugin_eval-p10', 'naive_average'],
        ['plugin_eval-p10-instruct_v1', 'format_metric'],
        ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
        ['plugin_eval-p10-plan_str_v1', 'f1_score'],
        ['plugin_eval-p10-plan_json_v1', 'f1_score'],
        ['plugin_eval-p10-reason_str_v2', 'thought'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
        ['plugin_eval-p10-retrieve_str_v2', 'name'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
        ['plugin_eval-p10-understand_str_v2', 'args'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
        ['plugin_eval-p10-review_str_v6', 'review_quality'],
    ],
    # Collect every *_summary_groups variable defined/imported above.
    summary_groups=sum(
        (v for k, v in locals().items() if k.endswith('_summary_groups')), []),
)
from mmengine.config import read_base

with read_base():
    from .groups.cibench import cibench_summary_groups

# Metric columns reported per CIBench flavour; the w/o-NLTK variants drop
# text_score because only the nltk subset produces it.
_full_metrics = ('executable', 'numeric_correct', 'text_score', 'vis_sim')
_wo_nltk_metrics = ('executable', 'numeric_correct', 'vis_sim')

summarizer = dict(
    dataset_abbrs=[
        '######## CIBench Generation ########', # category
        *[['cibench', _m] for _m in ('executable', 'general_correct', 'vis_sim')],
        '######## CIBench Template ########', # category
        *['cibench_template:' + _m for _m in _full_metrics],
        '######## CIBench Template Chinese ########', # category
        *['cibench_template_cn:' + _m for _m in _full_metrics],
        '######## CIBench Template w/o NLTK ########', # category no text score because it is only for nltk
        *['cibench_template_wo_nltk:' + _m for _m in _wo_nltk_metrics],
        '######## CIBench Template Chinese w/o NLTK ########', # category
        *['cibench_template_cn_wo_nltk:' + _m for _m in _wo_nltk_metrics],
    ],
    # Collect every *_summary_groups variable defined/imported above.
    summary_groups=sum(
        (v for k, v in locals().items() if k.endswith('_summary_groups')), []),
)
# (display name, dataset abbr, raw metric): greedy pass@1 and sampled pass@10
# renames for the HumanEval / MBPP family, covering both the `*_passk` and the
# `*_repeat10` run variants of each benchmark.
_passk_renames = [
    ('humaneval_pass@1(greedy)', 'openai_humaneval', 'humaneval_pass@1'),
    ('humaneval_pass@10', 'openai_humaneval_passk', 'humaneval_pass@10'),
    ('humaneval_pass@10', 'openai_humaneval_repeat10', 'humaneval_pass@10'),
    ('humaneval_cn_pass@1(greedy)', 'openai_humaneval_cn', 'humaneval_pass@1'),
    ('humaneval_cn_pass@10', 'openai_humaneval_cn_passk', 'humaneval_pass@10'),
    ('humaneval_cn_pass@10', 'openai_humaneval_cn_repeat10', 'humaneval_pass@10'),
    ('humaneval_plus_pass@1(greedy)', 'humaneval_plus', 'humaneval_plus_pass@1'),
    ('humaneval_plus_pass@10', 'humaneval_plus_passk', 'humaneval_plus_pass@10'),
    ('humaneval_plus_pass@10', 'humaneval_plus_repeat10', 'humaneval_plus_pass@10'),
    ('mbpp_pass@1(greedy)', 'mbpp', 'score'),
    ('mbpp_pass@10', 'mbpp_passk', 'pass@10'),
    ('mbpp_pass@10', 'mbpp_repeat10', 'pass@10'),
    ('mbpp_cn_pass@1(greedy)', 'mbpp_cn', 'score'),
    ('mbpp_cn_pass@10', 'mbpp_cn_passk', 'pass@10'),
    ('mbpp_cn_pass@10', 'mbpp_cn_repeat10', 'pass@10'),
    ('sanitized_mbpp_pass@1(greedy)', 'sanitized_mbpp', 'score'),
    ('sanitized_mbpp_pass@10', 'sanitized_mbpp_passk', 'pass@10'),
    ('sanitized_mbpp_pass@10', 'sanitized_mbpp_repeat10', 'pass@10'),
]
# rename
code_passk_summary_groups = [
    {'name': _name, 'subsets': [[_abbr, _metric]]}
    for _name, _abbr, _metric in _passk_renames
]
# real add: aggregates built on top of the renamed metrics
code_passk_summary_groups += [
    {'name': 'humanevalx',
     'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go',
                 'humanevalx-java', 'humanevalx-js']},
    {'name': 'code',
     'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)',
                 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)',
                 'humanevalx']},
]

summarizer = dict(
    dataset_abbrs=[
        'code',
        'humaneval_pass@1(greedy)',
        'humaneval_pass@10',
        'humaneval_cn_pass@1(greedy)',
        'humaneval_cn_pass@10',
        'humaneval_plus_pass@1(greedy)',
        'humaneval_plus_pass@10',
        'mbpp_pass@1(greedy)',
        'mbpp_pass@10',
        'mbpp_cn_pass@1(greedy)',
        'mbpp_cn_pass@10',
        'sanitized_mbpp_pass@1(greedy)',
        'sanitized_mbpp_pass@10',
        'humanevalx',
        'humanevalx-python',
        'humanevalx-cpp',
        'humanevalx-go',
        'humanevalx-java',
        'humanevalx-js',
    ],
    # Collect every *_summary_groups variable defined above.
    summary_groups=sum(
        (v for k, v in locals().items() if k.endswith('_summary_groups')), []),
)
# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
# Chinese circular single-choice knowledge subsets.
compassbench_v1_knowledge_names = [
    'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
    'compassbench_v1_knowledge-engineering-single_choice_cn_circular',
    'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
    'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
    'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
]

# Aggregates: the cn union, then its acc_1/perf_4 flavour paired with the
# English mixed-cloze score.
compassbench_v1_knowledge_groups = [
    {'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names},
    {'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
    {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]
# NOTE: a stray no-op string expression ('compassbench_v1_knowledge-mixed-cloze_en')
# left over from a bad merge was removed here.

summarizer = dict(
    dataset_abbrs=[
        'knowledge_acc_1_and_cloze',
        ['knowledge_cn', 'acc_1'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],
        'compassbench_v1_knowledge-mixed-cloze_en',
        'knowledge_perf_4_and_cloze',
        ['knowledge_cn', 'perf_4'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
        'compassbench_v1_knowledge-mixed-cloze_en',
    ],
    summary_groups=compassbench_v1_knowledge_groups
)
# This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen`
# Single-choice sets report acc_1/perf_4; primary-school cloze sets report
# plain accuracy in both metric flavours.
_acc_1_subsets = [
    ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
    ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
    ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
    ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
    ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
    ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
]
_perf_4_subsets = [
    ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
    ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
    ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
    ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
    ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
    ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
]

compassbench_v1_math_groups = [
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': _acc_1_subsets},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': _perf_4_subsets},
]

summarizer = dict(
    # Each aggregate name is followed by its constituent (dataset, metric) rows.
    dataset_abbrs=(['math_acc_1_and_fill_in_blank'] + _acc_1_subsets
                   + ['math_perf_4_and_fill_in_blank'] + _perf_4_subsets),
    summary_groups=compassbench_v1_math_groups,
)
# (dataset, metric) pairs scored by the language track; circular datasets list
# both their plain accuracy ('acc_origin') and circular-consistency
# ('perf_circular') metrics, the rest list a single metric.
compassbench_v1_language_names = [
    # ['information_retrieval_en', 'score'],
    # ['information_retrieval_zh', 'score'],
    ['intention_recognition_en_circular', 'acc_origin'],
    ['intention_recognition_en_circular', 'perf_circular'],
    ['intention_recognition_zh_circular', 'acc_origin'],
    ['intention_recognition_zh_circular', 'perf_circular'],
    ['sentiment_analysis_en_circular', 'acc_origin'],
    ['sentiment_analysis_en_circular', 'perf_circular'],
    ['sentiment_analysis_zh_circular', 'acc_origin'],
    ['sentiment_analysis_zh_circular', 'perf_circular'],
    ['translation', 'score'],
    ['content_critic_en_circular', 'acc_origin'],
    ['content_critic_en_circular', 'perf_circular'],
    ['content_critic_zh_circular', 'acc_origin'],
    ['content_critic_zh_circular', 'perf_circular'],
    ['content_summarization_en', 'rouge1'],
    ['content_summarization_zh', 'rouge1'],
    ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
    ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
    ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
    ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
]

# Per-language aggregates: filter the table by the '_zh'/'_en' tag in the
# dataset name and drop the metric flavour not being summarized.
compassbench_v1_language_groups = [
    {'name': 'language_zh_acc_1_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_zh' in _ds and _m != 'perf_circular']},
    {'name': 'language_en_acc_1_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_en' in _ds and _m != 'perf_circular']},
    {'name': 'language_acc_1_and_non_mcq',
     'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']},
    {'name': 'language_zh_perf_4_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_zh' in _ds and _m != 'acc_origin']},
    {'name': 'language_en_perf_4_and_non_mcq',
     'subsets': [[_ds, _m] for _ds, _m in compassbench_v1_language_names
                 if '_en' in _ds and _m != 'acc_origin']},
    {'name': 'language_perf_4_and_non_mcq',
     'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']},
]

summarizer = dict(
    dataset_abbrs=[
        'language_acc_1_and_non_mcq',
        'language_en_acc_1_and_non_mcq',
        'language_zh_acc_1_and_non_mcq',
        ['information_retrieval_en', 'score'],
        ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'acc_origin'],
        ['intention_recognition_zh_circular', 'acc_origin'],
        ['sentiment_analysis_en_circular', 'acc_origin'],
        ['sentiment_analysis_zh_circular', 'acc_origin'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'acc_origin'],
        ['content_critic_zh_circular', 'acc_origin'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
        ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
        'language_perf_4_and_non_mcq',
        'language_en_perf_4_and_non_mcq',
        'language_zh_perf_4_and_non_mcq',
        ['information_retrieval_en', 'score'],
        ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'perf_circular'],
        ['intention_recognition_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['sentiment_analysis_zh_circular', 'perf_circular'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_critic_zh_circular', 'perf_circular'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
        ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
    ],
    summary_groups=compassbench_v1_language_groups,
)
# ReasonBench aggregates: per-language logic unions plus an overall
# 'reasonbench' roll-up over commonsense + logic.
compassbench_v1_reason_groups = [
    {'name': 'reasonbench_cn_logic_circular',
     'subsets': [
         'reasonbench_cn_abductive_alphanlg_translated_circular',
         'reasonbench_cn_deductive_bbh3obj_translated_circular',
         'reasonbench_cn_deductive_logiqa_zh_circular',
         'reasonbench_cn_inductive_deer_translated_circular',
         'reasonbench_cn_inductive_selfgenerated_circular',
     ]},
    {'name': 'reasonbench_en_logic_circular',
     'subsets': [
         'reasonbench_en_abductive_alphanlg_circular',
         'reasonbench_en_deductive_bbh7obj_circular',
         'reasonbench_en_deductive_logiqa_zh_translated_circular',
         'reasonbench_en_deductive_ocnli_translated_circular',
         'reasonbench_en_inductive_deer_circular',
         'reasonbench_en_inductive_selfgenerated_circular',
     ]},
    {'name': 'reasonbench',
     'subsets': [
         'reasonbench_cn_commonsense_circular',
         'reasonbench_cn_logic_circular',
         'reasonbench_en_commonsense_circular',
         'reasonbench_en_logic_circular',
     ]},
]

# Each abbr is reported twice — once per metric flavour — preserving the order
# of the original literal table.
_reason_abbrs = [
    'reasonbench',
    'reasonbench_cn_commonsense_circular',
    'reasonbench_en_commonsense_circular',
    'reasonbench_cn_logic_circular',
    'reasonbench_en_logic_circular',
    'reasonbench_cn_abductive_alphanlg_translated_circular',
    'reasonbench_cn_deductive_bbh3obj_translated_circular',
    'reasonbench_cn_deductive_logiqa_zh_circular',
    'reasonbench_cn_inductive_deer_translated_circular',
    'reasonbench_cn_inductive_selfgenerated_circular',
    'reasonbench_en_abductive_alphanlg_circular',
    'reasonbench_en_deductive_bbh7obj_circular',
    'reasonbench_en_deductive_logiqa_zh_translated_circular',
    'reasonbench_en_deductive_ocnli_translated_circular',
    'reasonbench_en_inductive_deer_circular',
    'reasonbench_en_inductive_selfgenerated_circular',
]

summarizer = dict(
    dataset_abbrs=[[_abbr, _metric]
                   for _metric in ('acc_origin', 'perf_circular')
                   for _abbr in _reason_abbrs],
    summary_groups=compassbench_v1_reason_groups,
)
# Library names covered by the CIBench generation benchmarks.
#
# BUG FIX: the original first built 'cibench_generation_<lib>' names, then
# re-prefixed that *already prefixed* list with 'cibench_' (producing bogus
# 'cibench_cibench_generation_<lib>' subsets) and also overwrote the first
# cibench_summary_groups assignment, silently discarding the generation group.
# Both groups are now built independently from the raw library names.
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_summary_groups = [
    # Per-library generation subsets, e.g. 'cibench_generation_Pandas'.
    {'name': 'cibench_generation',
     'subsets': ['cibench_generation_' + lib for lib in _cibench]},
    # Plain per-library subsets aggregated under 'cibench'.
    {'name': 'cibench', 'subsets': ['cibench_' + lib for lib in _cibench]},
]
_cibench_template = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
                     'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template = ['cibench_template/' + i for i in _cibench_template]
# number of total exec questions in this module
# (columns: executable, numeric_correct, text_score, vis_sim)
_cibench_template_weight = {
    'lightgbm': [30, 15, 0, 0],
    'matplotlib': [42, 0, 0, 36],
    'nltk': [70, 30, 20, 10],
    'opencv': [60, 10, 0, 40],
    'pandas': [60, 40, 0, 10],
    'pytorch': [28, 0, 0, 0],
    'scipy': [60, 40, 0, 0],
    'seaborn': [42, 0, 0, 35],
    'sklearn': [42, 6, 0, 18],
    'tensorflow': [36, 6, 0, 12],
}
# One weighted summary group per metric; the column index selects the matching
# question count from the weight table.
for _metric, _col in (('executable', 0), ('numeric_correct', 1),
                      ('text_score', 2), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template],
        'weights': {'cibench_template/' + k: v[_col]
                    for k, v in _cibench_template_weight.items()},
    })
## chinese
_cibench_template_cn = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
                        'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template_cn = ['cibench_template_chinese/' + i for i in _cibench_template_cn]
# Chinese mirror of the template groups; reuses the English weight table with
# the 'cibench_template_chinese/' prefix.
for _metric, _col in (('executable', 0), ('numeric_correct', 1),
                      ('text_score', 2), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template_cn:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template_cn],
        'weights': {'cibench_template_chinese/' + k: v[_col]
                    for k, v in _cibench_template_weight.items()},
    })
## add more without nltk
# Same groups with the nltk subset filtered out; text_score is skipped because
# only nltk contributes it.
for _metric, _col in (('executable', 0), ('numeric_correct', 1), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template_wo_nltk:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template if 'nltk' not in _s],
        'weights': {'cibench_template/' + k: v[_col]
                    for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    })
# Chinese mirror of the w/o-NLTK groups.
for _metric, _col in (('executable', 0), ('numeric_correct', 1), ('vis_sim', 3)):
    cibench_summary_groups.append({
        'name': f'cibench_template_cn_wo_nltk:{_metric}',
        'subsets': [[_s, _metric] for _s in _cibench_template_cn if 'nltk' not in _s],
        'weights': {'cibench_template_chinese/' + k: v[_col]
                    for k, v in _cibench_template_weight.items() if 'nltk' not in k},
    })
......@@ -31,4 +31,38 @@ plugineval_summary_groups = [
['plugin_eval-review_str_v6', 'review_quality'],
]
},
# special treatment for first 10% data points
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'format_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_format_metric'],
['plugin_eval-p10-instruct_v1', 'json_format_metric'],
]
},
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'args_em_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'plugin_eval-p10',
'subsets': [
['plugin_eval-p10-instruct_v1', 'format_metric'],
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
]
},
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment