Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
opencompass
Commits
b4afe3e7
Unverified
Commit
b4afe3e7
authored
Jan 17, 2024
by
Fengzhe Zhou
Committed by
GitHub
Jan 17, 2024
Browse files
[Sync] Add InternLM2 Keyset Evaluation Demo (#807)
Co-authored-by:
zhangyifan1
<
zhangyifan1@pjlab.org.cn
>
parent
acae5609
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
444 additions
and
57 deletions
+444
-57
configs/models/others/hf_abel_7b_002.py
configs/models/others/hf_abel_7b_002.py
+31
-0
configs/models/others/hf_arithmo_mistral_7b.py
configs/models/others/hf_arithmo_mistral_7b.py
+33
-0
configs/models/others/hf_gsm8k_rft_llama7b2_u13b.py
configs/models/others/hf_gsm8k_rft_llama7b2_u13b.py
+33
-0
configs/models/others/hf_metamath_7b_v1_0.py
configs/models/others/hf_metamath_7b_v1_0.py
+33
-0
configs/models/others/hf_metamath_llemma_7b.py
configs/models/others/hf_metamath_llemma_7b.py
+33
-0
configs/models/others/hf_metamath_mistral_7b.py
configs/models/others/hf_metamath_mistral_7b.py
+33
-0
configs/models/others/hf_phi_2.py
configs/models/others/hf_phi_2.py
+24
-0
configs/models/others/hf_telechat_7b_chat.py
configs/models/others/hf_telechat_7b_chat.py
+34
-0
configs/models/others/hf_yayi2_30b_base.py
configs/models/others/hf_yayi2_30b_base.py
+25
-0
configs/models/wizardlm/hf_wizardmath_7b_v1_0.py
configs/models/wizardlm/hf_wizardmath_7b_v1_0.py
+33
-0
configs/models/wizardlm/hf_wizardmath_7b_v1_1.py
configs/models/wizardlm/hf_wizardmath_7b_v1_1.py
+33
-0
configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
+1
-1
configs/summarizers/agent_bench.py
configs/summarizers/agent_bench.py
+22
-9
configs/summarizers/groups/leval.py
configs/summarizers/groups/leval.py
+3
-0
configs/summarizers/groups/longbench.py
configs/summarizers/groups/longbench.py
+10
-0
configs/summarizers/groups/mathbench.py
configs/summarizers/groups/mathbench.py
+1
-1
configs/summarizers/groups/mathbench_agent.py
configs/summarizers/groups/mathbench_agent.py
+1
-1
configs/summarizers/groups/plugineval.py
configs/summarizers/groups/plugineval.py
+38
-42
configs/summarizers/internlm2_keyset.py
configs/summarizers/internlm2_keyset.py
+20
-0
configs/summarizers/leval.py
configs/summarizers/leval.py
+3
-3
No files found.
configs/models/others/hf_abel_7b_002.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Plain "Question:" / "Answer:" dialogue wrapper for GAIR/Abel-7B-002.
# generate=True on the BOT entry makes decoding start right after "Answer:\n".
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'begin': 'Question:\n', 'end': '\n'},
        {'role': 'BOT', 'begin': 'Answer:\n', 'end': '\n', 'generate': True},
    ],
}

# Single-entry model list: Abel-7B-002 through the HF causal-LM wrapper.
models = [
    {
        'abbr': 'abel-7b-002',
        'type': HuggingFaceCausalLM,
        'path': 'GAIR/Abel-7B-002',
        'tokenizer_path': 'GAIR/Abel-7B-002',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',       # left-pad so generation continues the prompt
            'truncation_side': 'left',    # keep the most recent context when truncating
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_arithmo_mistral_7b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Arithmo prompt format: single-line "Question: " / "Answer: " turns,
# each turn closed by a blank line. The empty 'begin' keeps no preamble.
_meta_template = {
    'begin': '',
    'round': [
        {'role': 'HUMAN', 'begin': 'Question: ', 'end': '\n\n'},
        {'role': 'BOT', 'begin': 'Answer: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'arithmo-mistral-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'akjindal53244/Arithmo-Mistral-7B',
        'tokenizer_path': 'akjindal53244/Arithmo-Mistral-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_gsm8k_rft_llama7b2_u13b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style instruction template used by the GSM8K-RFT checkpoint:
# fixed preamble, then "### Instruction:" / "### Response:" turns.
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response:', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'gsm8k-rft-llama7b2-u13b',
        'type': HuggingFaceCausalLM,
        'path': 'OFA-Sys/gsm8k-rft-llama7b2-u13b',
        'tokenizer_path': 'OFA-Sys/gsm8k-rft-llama7b2-u13b',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_metamath_7b_v1_0.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style instruction template for MetaMath; note the trailing space
# after "### Response:" which this checkpoint's prompt format uses.
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'metamath-7b-v1.0-hf',
        'type': HuggingFaceCausalLM,
        'path': 'meta-math/MetaMath-7B-V1.0',
        'tokenizer_path': 'meta-math/MetaMath-7B-V1.0',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_metamath_llemma_7b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Same Alpaca-style MetaMath template as the other MetaMath configs
# (fixed preamble + "### Instruction:" / "### Response: " turns).
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'metamath-llemma-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'meta-math/MetaMath-Llemma-7B',
        'tokenizer_path': 'meta-math/MetaMath-Llemma-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_metamath_mistral_7b.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Alpaca-style MetaMath template (identical to the other MetaMath configs).
_meta_template = {
    'begin': 'Below is an instruction that describes a task. '
             'Write a response that appropriately completes the request.\n\n',
    'round': [
        {'role': 'HUMAN', 'begin': '### Instruction:\n', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response: ', 'end': '\n\n', 'generate': True},
    ],
}

models = [
    {
        'abbr': 'metamath-mistral-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'meta-math/MetaMath-Mistral-7B',
        'tokenizer_path': 'meta-math/MetaMath-Mistral-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_phi_2.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# microsoft/phi-2 evaluated as a plain base model: no meta_template,
# but min_out_len=3 forces at least a few generated tokens.
models = [
    {
        'type': HuggingFaceCausalLM,
        'abbr': 'phi-2-hf',
        'path': 'microsoft/phi-2',
        'tokenizer_path': 'microsoft/phi-2',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'max_out_len': 100,
        'min_out_len': 3,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
    }
]
configs/models/others/hf_telechat_7b_chat.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# TeleChat chat markup: user turns open with <_user>, bot turns with <_bot>
# and close with <_end>. eos_token_id=160133 stops generation at that token
# (presumably the id of <_end> in the telechat tokenizer — TODO confirm).
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'begin': '<_user>'},
        {'role': 'BOT', 'begin': '<_bot>', 'end': '<_end>', 'generate': True},
    ],
    'eos_token_id': 160133,
}

models = [
    {
        'abbr': 'telechat-7b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'Tele-AI/telechat-7B',
        'tokenizer_path': 'Tele-AI/telechat-7B',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
        # Strip everything after the closing tag from the decoded output.
        'end_str': '<_end>',
    }
]
configs/models/others/hf_yayi2_30b_base.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# yayi2-30b base model: no chat template; 30B weights are sharded across
# 4 GPUs via device_map='auto' (hence num_gpus=4 below).
models = [
    {
        'abbr': 'yayi2-30b-hf',
        'type': HuggingFaceCausalLM,
        'path': 'wenge-research/yayi2-30b',
        'tokenizer_path': 'wenge-research/yayi2-30b',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'max_out_len': 100,
        'min_out_len': 3,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 4, 'num_procs': 1},
    }
]
configs/models/wizardlm/hf_wizardmath_7b_v1_0.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# WizardMath prompt: the user turn has no prefix (only a blank-line
# separator); the model turn starts at "### Response:" and ends at </s>.
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response:', 'end': '</s>', 'generate': True},
    ],
}

models = [
    {
        'type': HuggingFaceCausalLM,
        'abbr': 'wizardmath-7b-v1.0-hf',
        'path': 'WizardLM/WizardMath-7B-V1.0',
        'tokenizer_path': 'WizardLM/WizardMath-7B-V1.0',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
        # Truncate decoded output at the EOS marker.
        'end_str': '</s>',
    }
]
configs/models/wizardlm/hf_wizardmath_7b_v1_1.py
0 → 100644
View file @
b4afe3e7
from opencompass.models import HuggingFaceCausalLM

# Same WizardMath prompt format as v1.0: bare user turn, "### Response:"
# bot prefix, </s> terminator.
_meta_template = {
    'round': [
        {'role': 'HUMAN', 'end': '\n\n'},
        {'role': 'BOT', 'begin': '### Response:', 'end': '</s>', 'generate': True},
    ],
}

models = [
    {
        'type': HuggingFaceCausalLM,
        'abbr': 'wizardmath-7b-v1.1-hf',
        'path': 'WizardLM/WizardMath-7B-V1.1',
        'tokenizer_path': 'WizardLM/WizardMath-7B-V1.1',
        'model_kwargs': {
            'device_map': 'auto',
            'trust_remote_code': True,
        },
        'tokenizer_kwargs': {
            'padding_side': 'left',
            'truncation_side': 'left',
            'trust_remote_code': True,
        },
        'meta_template': _meta_template,
        'max_out_len': 100,
        'max_seq_len': 2048,
        'batch_size': 8,
        'run_cfg': {'num_gpus': 1, 'num_procs': 1},
        # Truncate decoded output at the EOS marker.
        'end_str': '</s>',
    }
]
configs/models/wizardlm/vllm_wizardlm_13b_v1_2.py
View file @
b4afe3e7
...
@@ -16,7 +16,7 @@ models = [
...
@@ -16,7 +16,7 @@ models = [
meta_template
=
_meta_template
,
meta_template
=
_meta_template
,
max_out_len
=
100
,
max_out_len
=
100
,
max_seq_len
=
2048
,
max_seq_len
=
2048
,
batch_size
=
32
,
batch_size
=
1
,
generation_kwargs
=
dict
(
temperature
=
0
),
generation_kwargs
=
dict
(
temperature
=
0
),
end_str
=
'</s>'
,
end_str
=
'</s>'
,
run_cfg
=
dict
(
num_gpus
=
1
,
num_procs
=
1
),
run_cfg
=
dict
(
num_gpus
=
1
,
num_procs
=
1
),
...
...
configs/summarizers/agent_bench.py
View file @
b4afe3e7
...
@@ -11,8 +11,8 @@ agent_summary_groups = [
...
@@ -11,8 +11,8 @@ agent_summary_groups = [
dict
(
name
=
'math_perf_4_and_fill_in_blank-agent'
,
subsets
=
[[
'compassbench_v1_math-high-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-high-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-primary-cloze_cn-agent'
,
'accuracy'
],
[
'compassbench_v1_math-primary-cloze_en-agent'
,
'accuracy'
]]),
dict
(
name
=
'math_perf_4_and_fill_in_blank-agent'
,
subsets
=
[[
'compassbench_v1_math-high-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-high-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_cn-agent'
,
'perf_4'
],
[
'compassbench_v1_math-middle-single_choice_en-agent'
,
'perf_4'
],
[
'compassbench_v1_math-primary-cloze_cn-agent'
,
'accuracy'
],
[
'compassbench_v1_math-primary-cloze_en-agent'
,
'accuracy'
]]),
dict
(
dict
(
name
=
'agent'
,
name
=
'agent'
,
subsets
=
[
'math_perf_4_and_fill_in_blank-agent'
,
'cibench_template_wo_nltk:executable'
,
'cibench_template_wo_nltk:numeric_correct'
,
'cibench_template_wo_nltk:vis_sim'
,
'cibench_template_cn_wo_nltk:executable'
,
'cibench_template_cn_wo_nltk:numeric_correct'
,
'cibench_template_cn_wo_nltk:vis_sim'
,
'plugin_eval-p10'
],
subsets
=
[
'math_perf_4_and_fill_in_blank-agent'
,
'cibench_template_wo_nltk:executable'
,
'cibench_template_wo_nltk:numeric_correct'
,
'cibench_template_wo_nltk:vis_sim'
,
'cibench_template_cn_wo_nltk:executable'
,
'cibench_template_cn_wo_nltk:numeric_correct'
,
'cibench_template_cn_wo_nltk:vis_sim'
,
'plugin_eval-p10'
,
'plugin_eval-p10_zh'
],
weights
=
{
'math_perf_4_and_fill_in_blank-agent'
:
1
,
'cibench_template_wo_nltk:executable'
:
0.5
,
'cibench_template_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_wo_nltk:vis_sim'
:
0.25
,
'cibench_template_cn_wo_nltk:executable'
:
0.5
,
'cibench_template_cn_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_cn_wo_nltk:vis_sim'
:
0.25
,
'plugin_eval-p10'
:
1
}
weights
=
{
'math_perf_4_and_fill_in_blank-agent'
:
1
,
'cibench_template_wo_nltk:executable'
:
0.5
,
'cibench_template_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_wo_nltk:vis_sim'
:
0.25
,
'cibench_template_cn_wo_nltk:executable'
:
0.5
,
'cibench_template_cn_wo_nltk:numeric_correct'
:
0.25
,
'cibench_template_cn_wo_nltk:vis_sim'
:
0.25
,
'plugin_eval-p10'
:
1
,
'plugin_eval-p10_zh'
:
1
}
)
)
]
]
...
@@ -48,13 +48,26 @@ summarizer = dict(
...
@@ -48,13 +48,26 @@ summarizer = dict(
[
'plugin_eval-p10-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-p10-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-p10-reason_str_v2'
,
'thought'
],
[
'plugin_eval-p10-reason_str_v1'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1'
,
'thought'
],
[
'plugin_eval-p10-retrieve_str_v2'
,
'name'
],
[
'plugin_eval-p10-retrieve_str_v1'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1'
,
'name'
],
[
'plugin_eval-p10-understand_str_v2'
,
'args'
],
[
'plugin_eval-p10-understand_str_v1'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1'
,
'args'
],
[
'plugin_eval-p10-review_str_v6'
,
'review_quality'
],
[
'plugin_eval-p10-review_str_v1'
,
'review_quality'
],
[
'plugin_eval-p10_zh'
,
'naive_average'
],
[
'plugin_eval-p10-instruct_v1_zh'
,
'format_metric'
],
[
'plugin_eval-p10-instruct_v1_zh'
,
'args_em_metric'
],
[
'plugin_eval-p10-plan_str_v1_zh'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1_zh'
,
'f1_score'
],
[
'plugin_eval-p10-reason_str_v1_zh'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1_zh'
,
'thought'
],
[
'plugin_eval-p10-retrieve_str_v1_zh'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1_zh'
,
'name'
],
[
'plugin_eval-p10-understand_str_v1_zh'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v1_zh'
,
'args'
],
[
'plugin_eval-p10-review_str_v1_zh'
,
'review_quality'
],
],
],
summary_groups
=
sum
(
summary_groups
=
sum
(
[
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
"_summary_groups"
)],
[])
[
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
"_summary_groups"
)],
[])
...
...
configs/summarizers/groups/leval.py
0 → 100644
View file @
b4afe3e7
# One aggregate group averaging all 18 LEval subsets. The first five are the
# exact-match (accuracy) tasks, the remaining thirteen the generation/ROUGE
# tasks — the split mirrors the section headers in the LEval summarizer.
_LEVAL_EXACT_MATCH = [
    'LEval_coursera',
    'LEval_gsm100',
    'LEval_quality',
    'LEval_tpo',
    'LEval_topic_retrieval',
]

_LEVAL_GEN = [
    'LEval_financialqa',
    'LEval_gov_report_summ',
    'LEval_legal_contract_qa',
    'LEval_meeting_summ',
    'LEval_multidocqa',
    'LEval_narrativeqa',
    'LEval_nq',
    'LEval_news_summ',
    'LEval_paper_assistant',
    'LEval_patent_summ',
    'LEval_review_summ',
    'LEval_scientificqa',
    'LEval_tvshow_summ',
]

leval_summary_groups = [
    {'name': 'leval', 'subsets': _LEVAL_EXACT_MATCH + _LEVAL_GEN},
]
configs/summarizers/groups/longbench.py
0 → 100644
View file @
b4afe3e7
# Summary groups for LongBench: six task-family averages plus one overall
# 'longbench' average over the family scores.
#
# NOTE(review): the original file declared 'longbench_code-completion' twice
# (two identical entries) and also listed it twice in the overall 'longbench'
# subsets, which double-weights code completion in the naive average. Both
# duplicates are removed here.
longbench_summary_groups = [
    {'name': 'longbench_single-document-qa',
     'subsets': ['LongBench_narrativeqa', 'LongBench_qasper',
                 'LongBench_multifieldqa_en', 'LongBench_multifieldqa_zh']},
    {'name': 'longbench_multi-document-qa',
     'subsets': ['LongBench_hotpotqa', 'LongBench_2wikimqa',
                 'LongBench_musique', 'LongBench_dureader']},
    {'name': 'longbench_summarization',
     'subsets': ['LongBench_gov_report', 'LongBench_qmsum',
                 'LongBench_multi_news', 'LongBench_vcsum']},
    {'name': 'longbench_few-shot-learning',
     'subsets': ['LongBench_trec', 'LongBench_triviaqa',
                 'LongBench_samsum', 'LongBench_lsht']},
    {'name': 'longbench_synthetic-tasks',
     'subsets': ['LongBench_passage_count', 'LongBench_passage_retrieval_en',
                 'LongBench_passage_retrieval_zh']},
    {'name': 'longbench_code-completion',
     'subsets': ['LongBench_lcc', 'LongBench_repobench-p']},
    # Overall score: average of the six family averages above.
    {'name': 'longbench',
     'subsets': ['longbench_single-document-qa', 'longbench_multi-document-qa',
                 'longbench_summarization', 'longbench_few-shot-learning',
                 'longbench_synthetic-tasks', 'longbench_code-completion']},
]
configs/summarizers/groups/mathbench.py
View file @
b4afe3e7
...
@@ -66,9 +66,9 @@ naive_mathbench_summary_groups = [
...
@@ -66,9 +66,9 @@ naive_mathbench_summary_groups = [
{
{
'name'
:
'mathbench-circular-and-cloze'
,
'name'
:
'mathbench-circular-and-cloze'
,
'subsets'
:
[
'subsets'
:
[
'mathbench-college-circular'
,
'mathbench-high-circular'
,
'mathbench-high-circular'
,
'mathbench-middle-circular'
,
'mathbench-middle-circular'
,
'mathbench-circular'
,
'mathbench-college-cloze_en'
,
'mathbench-college-cloze_en'
,
'mathbench-primary-cloze_cn'
,
'mathbench-primary-cloze_cn'
,
],
],
...
...
configs/summarizers/groups/mathbench_agent.py
View file @
b4afe3e7
...
@@ -65,9 +65,9 @@ mathbench_agent_summary_groups = [
...
@@ -65,9 +65,9 @@ mathbench_agent_summary_groups = [
{
{
'name'
:
'mathbench-circular-and-cloze-agent'
,
'name'
:
'mathbench-circular-and-cloze-agent'
,
'subsets'
:
[
'subsets'
:
[
'mathbench-college-circular-agent'
,
'mathbench-high-circular-agent'
,
'mathbench-high-circular-agent'
,
'mathbench-middle-circular-agent'
,
'mathbench-middle-circular-agent'
,
'mathbench-circular-agent'
,
'mathbench-college-cloze_en-agent'
,
'mathbench-college-cloze_en-agent'
,
'mathbench-primary-cloze_cn-agent'
,
'mathbench-primary-cloze_cn-agent'
,
],
],
...
...
configs/summarizers/groups/plugineval.py
View file @
b4afe3e7
plugineval_summary_groups
=
[
from
copy
import
deepcopy
_base_summary_groups
=
[
{
{
'name'
:
'plugin_eval-instruct_v1'
,
'name'
:
'plugin_eval-instruct_v1'
,
'metric'
:
'format_metric'
,
'metric'
:
'format_metric'
,
...
@@ -22,47 +24,41 @@ plugineval_summary_groups = [
...
@@ -22,47 +24,41 @@ plugineval_summary_groups = [
[
'plugin_eval-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-reason_str_v2'
,
'thought'
],
[
'plugin_eval-reason_str_v1'
,
'thought'
],
[
'plugin_eval-reason_retrieve_understand_json_v2'
,
'thought'
],
[
'plugin_eval-reason_retrieve_understand_json_v1'
,
'thought'
],
[
'plugin_eval-retrieve_str_v2'
,
'name'
],
[
'plugin_eval-retrieve_str_v1'
,
'name'
],
[
'plugin_eval-reason_retrieve_understand_json_v2'
,
'name'
],
[
'plugin_eval-reason_retrieve_understand_json_v1'
,
'name'
],
[
'plugin_eval-understand_str_v2'
,
'args'
],
[
'plugin_eval-understand_str_v1'
,
'args'
],
[
'plugin_eval-reason_retrieve_understand_json_v2'
,
'args'
],
[
'plugin_eval-reason_retrieve_understand_json_v1'
,
'args'
],
[
'plugin_eval-review_str_v6'
,
'review_quality'
],
[
'plugin_eval-review_str_v1'
,
'review_quality'
],
]
},
# special treatment for first 10% data points
{
'name'
:
'plugin_eval-p10-instruct_v1'
,
'metric'
:
'format_metric'
,
'subsets'
:
[
[
'plugin_eval-p10-instruct_v1'
,
'string_format_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'json_format_metric'
],
]
},
{
'name'
:
'plugin_eval-p10-instruct_v1'
,
'metric'
:
'args_em_metric'
,
'subsets'
:
[
[
'plugin_eval-p10-instruct_v1'
,
'string_args_em_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'json_args_em_metric'
],
]
},
{
'name'
:
'plugin_eval-p10'
,
'subsets'
:
[
[
'plugin_eval-p10-instruct_v1'
,
'format_metric'
],
[
'plugin_eval-p10-instruct_v1'
,
'args_em_metric'
],
[
'plugin_eval-p10-plan_str_v1'
,
'f1_score'
],
[
'plugin_eval-p10-plan_json_v1'
,
'f1_score'
],
[
'plugin_eval-p10-reason_str_v2'
,
'thought'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'thought'
],
[
'plugin_eval-p10-retrieve_str_v2'
,
'name'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'name'
],
[
'plugin_eval-p10-understand_str_v2'
,
'args'
],
[
'plugin_eval-p10-reason_retrieve_understand_json_v2'
,
'args'
],
[
'plugin_eval-p10-review_str_v6'
,
'review_quality'
],
]
]
},
},
]
]
plugineval_summary_groups
=
[]
# base
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
plugineval_summary_groups
.
append
(
group
)
# base _zh
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
group
[
'name'
]
=
group
[
'name'
]
+
'_zh'
group
[
'subsets'
]
=
[[
subset
[
0
]
+
'_zh'
,
subset
[
1
]]
for
subset
in
group
[
'subsets'
]]
plugineval_summary_groups
.
append
(
group
)
# base -p10-
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
group
[
'name'
]
=
group
[
'name'
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
)
group
[
'subsets'
]
=
[[
subset
[
0
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
),
subset
[
1
]]
for
subset
in
group
[
'subsets'
]]
plugineval_summary_groups
.
append
(
group
)
# base -p10- _zh
for
group
in
_base_summary_groups
:
group
=
deepcopy
(
group
)
group
[
'name'
]
=
group
[
'name'
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
)
+
'_zh'
group
[
'subsets'
]
=
[[
subset
[
0
].
replace
(
'plugin_eval'
,
'plugin_eval-p10'
)
+
'_zh'
,
subset
[
1
]]
for
subset
in
group
[
'subsets'
]]
plugineval_summary_groups
.
append
(
group
)
configs/summarizers/internlm2_keyset.py
0 → 100644
View file @
b4afe3e7
from mmengine.config import read_base

# Pull in the per-benchmark summary groups so they appear in locals()
# and can be swept up into summary_groups below. Import order is kept
# as-is so the concatenation order of the groups is unchanged.
with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.bbh import bbh_summary_groups

# [dataset abbreviation, metric] pairs shown in the keyset summary table.
_keyset_dataset_abbrs = [
    ['mmlu', 'naive_average'],
    ['agieval', 'naive_average'],
    ['bbh', 'naive_average'],
    ['gsm8k', 'accuracy'],
    ['math', 'accuracy'],
    ['openai_humaneval', 'humaneval_pass@1'],
    ['sanitized_mbpp', 'score'],
]

summarizer = dict(
    dataset_abbrs=_keyset_dataset_abbrs,
    # Concatenate every *_summary_groups list brought in via read_base().
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')],
        [],
    ),
)
configs/summarizers/leval.py
View file @
b4afe3e7
summarizer
=
dict
(
summarizer
=
dict
(
dataset_abbrs
=
[
dataset_abbrs
=
[
'--------- LEval Exact Match (Acc) ---------'
,
# category
'--------- LEval Exact Match (Acc) ---------'
,
# category
"
LEval_coursera
"
,
'
LEval_coursera
'
,
'LEval_gsm100'
,
'LEval_gsm100'
,
'LEval_quality'
,
'LEval_quality'
,
"
LEval_tpo
"
,
'
LEval_tpo
'
,
'LEval_topic_retrieval'
,
'LEval_topic_retrieval'
,
'--------- LEval Gen (ROUGE) ---------'
,
# category
'--------- LEval Gen (ROUGE) ---------'
,
# category
'LEval_financialqa'
,
'LEval_financialqa'
,
...
@@ -21,5 +21,5 @@ summarizer = dict(
...
@@ -21,5 +21,5 @@ summarizer = dict(
'LEval_scientificqa'
,
'LEval_scientificqa'
,
'LEval_tvshow_summ'
'LEval_tvshow_summ'
],
],
summary_groups
=
sum
([
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
"
_summary_groups
"
)],
[]),
summary_groups
=
sum
([
v
for
k
,
v
in
locals
().
items
()
if
k
.
endswith
(
'
_summary_groups
'
)],
[]),
)
)
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment