"...composable_kernel_rocm.git" did not exist on "6e28a8ac64af00418b95c681645690cb16633ab9"
Unverified commit b03d5dc5, authored by Fengzhe Zhou, committed by GitHub

[Sync] Sync Internal (#941)

parent bbec7d87
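# Dataset config: NQ-open k-shot generation, chat-format (HUMAN/BOT) prompt templates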
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever, RandomRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NQOpenDataset, NQEvaluator

nq_datasets = []
for k in [1]:
    nq_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        nq_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A:'),
                    ]
                )
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        nq_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A: {answer}.\n'),
                    ]
                ),
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin="</E>",
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A:'),
                    ]
                ),
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT")

    nq_datasets.append(
        dict(
            type=NQOpenDataset,
            abbr=f'nq_open_{k}shot',
            path='./data/nq-open/',
            reader_cfg=nq_reader_cfg,
            infer_cfg=nq_infer_cfg,
            eval_cfg=nq_eval_cfg)
    )
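# Dataset config: NQ-open k-shot generation, plain-string prompt templates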
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NQOpenDataset, NQEvaluator

nq_datasets = []
for k in [1]:
    nq_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        nq_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: ',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        nq_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: {answer}.\n',
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template='</E>Q: {question}\nA: ',
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT")

    nq_datasets.append(
        dict(
            type=NQOpenDataset,
            abbr=f'nq_open_{k}shot',
            path='./data/nq-open/',
            reader_cfg=nq_reader_cfg,
            infer_cfg=nq_infer_cfg,
            eval_cfg=nq_eval_cfg)
    )
@@ -11,19 +11,12 @@ race_reader_cfg = dict(
     test_split="test"
 )
 
+hint = "Read the article, and answer the question by replying A, B, C or D."
+question_and_options = "{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
 race_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
-        template={
-            'A':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: A',
-            'B':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: B',
-            'C':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: C',
-            'D':
-            'Read the article, and answer the question by replying A, B, C or D.\n\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n\nAnswer: D',
-        }),
+        template={answer: hint + '\n\n' + question_and_options + '\n\nAnswer: ' + answer for answer in ['A', 'B', 'C', 'D']}),
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=PPLInferencer))
......
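# Dataset config: TriviaQA (wiki) k-shot generation, plain-string prompt templates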
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator

triviaqa_datasets = []
for k in [1]:
    triviaqa_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        triviaqa_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: ',
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        triviaqa_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template='Q: {question}\nA: {answer}.\n',
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template='</E>Q: {question}\nA: ',
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT")

    triviaqa_datasets.append(
        dict(
            type=TriviaQADataset_V2,
            abbr=f'triviaqa_wiki_{k}shot',
            path='./data/triviaqa',
            reader_cfg=triviaqa_reader_cfg,
            infer_cfg=triviaqa_infer_cfg,
            eval_cfg=triviaqa_eval_cfg)
    )
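# Dataset config: TriviaQA (wiki) k-shot generation, chat-format (HUMAN/BOT) prompt templates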
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator

triviaqa_datasets = []
for k in [1]:
    triviaqa_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        triviaqa_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A:'),
                    ]
                )
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        triviaqa_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A: {answer}.\n'),
                    ]
                ),
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin="</E>",
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A:'),
                    ]
                ),
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT")

    triviaqa_datasets.append(
        dict(
            type=TriviaQADataset_V2,
            abbr=f'triviaqa_wiki_{k}shot',
            path='./data/triviaqa',
            reader_cfg=triviaqa_reader_cfg,
            infer_cfg=triviaqa_infer_cfg,
            eval_cfg=triviaqa_eval_cfg)
    )
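# Dataset config: WinoGrande 5-shot generation with first-option answer postprocessing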
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V3
from opencompass.utils.text_postprocessors import first_option_postprocess

winogrande_reader_cfg = dict(
    input_columns=["opt1", "opt2"],
    output_column="answer",
    train_split="train_xs",
    test_split="dev",
)

winogrande_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin="</E>",
            round=[
                dict(role="HUMAN", prompt="Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}\nAnswer:"),
                dict(role="BOT", prompt="{answer}"),
            ]
        ),
        ice_token="</E>",
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer),
)

winogrande_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options="AB"),
)

winogrande_datasets = [
    dict(
        abbr="winogrande",
        type=winograndeDataset_V3,
        path="./data/winogrande",
        reader_cfg=winogrande_reader_cfg,
        infer_cfg=winogrande_infer_cfg,
        eval_cfg=winogrande_eval_cfg,
    )
]
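# Dataset config: WinoGrande 5-shot log-likelihood scoring via LLInferencer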
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import LLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V3

winogrande_reader_cfg = dict(
    input_columns=['opt1', 'opt2'],
    output_column='answer',
    train_split="train_xs",
    test_split="dev",
)

question_and_options = "Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}"

winogrande_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={answer: f"{question_and_options}\nAnswer: {answer}\n" for answer in ["A", "B"]},
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={answer: f"</E>{question_and_options}\nAnswer: {answer}" for answer in ["A", "B"]},
        ice_token="</E>",
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=LLInferencer),
)

winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

winogrande_datasets = [
    dict(
        abbr='winogrande',
        type=winograndeDataset_V3,
        path='./data/winogrande',
        reader_cfg=winogrande_reader_cfg,
        infer_cfg=winogrande_infer_cfg,
        eval_cfg=winogrande_eval_cfg)
]
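# Model config: google/gemma-2b (base) served through HuggingFaceCausalLM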
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-2b-hf',
        path="google/gemma-2b",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
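# Model config: google/gemma-2b-it with <start_of_turn>/<end_of_turn> chat meta template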
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
        dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
    ],
    eos_token_id=151645,
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-2b-it-hf',
        path="google/gemma-2b-it",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
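# Model config: google/gemma-7b (base) served through HuggingFaceCausalLM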
from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-7b-hf',
        path="google/gemma-7b",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
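# Model config: google/gemma-7b-it with <start_of_turn>/<end_of_turn> chat meta template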
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
        dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
    ],
    eos_token_id=151645,
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='gemma-7b-it-hf',
        path="google/gemma-7b-it",
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        meta_template=_meta_template,
        min_out_len=1,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
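# Model config: openbmb/MiniCPM-2B-dpo-fp32 with <用户>/<AI> meta template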
from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<用户>'),
        dict(role="BOT", begin="<AI>", generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='minicpm-2b-dpo-hf',
        path='openbmb/MiniCPM-2B-dpo-fp32',
        tokenizer_path='openbmb/MiniCPM-2B-dpo-fp32',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<用户>',
    )
]
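# Model config: openbmb/MiniCPM-2B-sft-fp32 with <用户>/<AI> meta template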
from opencompass.models import HuggingFace

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='<用户>'),
        dict(role="BOT", begin="<AI>", generate=True),
    ],
)

models = [
    dict(
        type=HuggingFace,
        abbr='minicpm-2b-sft-hf',
        path='openbmb/MiniCPM-2B-sft-fp32',
        tokenizer_path='openbmb/MiniCPM-2B-sft-fp32',
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<用户>',
    )
]
@@ -20,6 +20,6 @@ models = [
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
+        run_cfg=dict(num_gpus=2, num_procs=1),
     )
 ]
@@ -4,8 +4,7 @@ from opencompass.models import VLLM
 _meta_template = dict(
     round=[
         dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
-        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n',
-             generate=True),
+        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
     ],
     eos_token_id=151645,
 )
......
@@ -5,101 +5,27 @@ with read_base():
    from .groups.plugineval import plugineval_summary_groups

agent_summary_groups = [
    dict(name='math_acc_1_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    dict(name='math_perf_4_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    dict(name='math_acc_1_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    dict(
        name='agent',
        subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'],
        weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}
    )
    # dict(name='math_acc_1_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    # dict(name='math_perf_4_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
    # dict(name='math_acc_1_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    # dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
    # dict(name='agent', subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'], weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}),
    dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']),
    dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']),
    dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']),
    dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']),
    dict(name='agent', subsets=['agent_cn', 'agent_en']),
]

summarizer = dict(
    dataset_abbrs=[
        # 'agent',
        # 'math_acc_1_and_fill_in_blank-native',
        # 'math_perf_4_and_fill_in_blank-native',
        # # '######## MathBench-Agent Accuracy ########', # category
        # 'math_acc_1_and_fill_in_blank-agent',
        # 'math_perf_4_and_fill_in_blank-agent',
        # # '######## CIBench Template ########', # category
        # 'cibench_template:executable',
        # 'cibench_template:numeric_correct',
        # 'cibench_template:text_score',
        # 'cibench_template:vis_sim',
        # # '######## CIBench Template Chinese ########', # category
        # 'cibench_template_cn:executable',
        # 'cibench_template_cn:numeric_correct',
        # 'cibench_template_cn:text_score',
        # 'cibench_template_cn:vis_sim',
        # # '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
        # 'cibench_template_wo_nltk:executable',
        # 'cibench_template_wo_nltk:numeric_correct',
        # 'cibench_template_wo_nltk:vis_sim',
        # # '######## CIBench Template Chinese w/o NLTK ########', # category
        # 'cibench_template_cn_wo_nltk:executable',
        # 'cibench_template_cn_wo_nltk:numeric_correct',
        # 'cibench_template_cn_wo_nltk:vis_sim',
        # '######## T-Eval ########', # category
        ['plugin_eval-p10', 'naive_average'],
        ['plugin_eval-p10-instruct_v1', 'format_metric'],
        ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
        ['plugin_eval-p10-plan_str_v1', 'f1_score'],
        ['plugin_eval-p10-plan_json_v1', 'f1_score'],
        ['plugin_eval-p10-reason_str_v1', 'thought'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'thought'],
        ['plugin_eval-p10-retrieve_str_v1', 'name'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'name'],
        ['plugin_eval-p10-understand_str_v1', 'args'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1', 'args'],
        ['plugin_eval-p10-review_str_v1', 'review_quality'],
        ['plugin_eval-p10_zh', 'naive_average'],
        ['plugin_eval-p10-instruct_v1_zh', 'format_metric'],
        ['plugin_eval-p10-instruct_v1_zh', 'args_em_metric'],
        ['plugin_eval-p10-plan_str_v1_zh', 'f1_score'],
        ['plugin_eval-p10-plan_json_v1_zh', 'f1_score'],
        ['plugin_eval-p10-reason_str_v1_zh', 'thought'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
        ['plugin_eval-p10-retrieve_str_v1_zh', 'name'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'name'],
        ['plugin_eval-p10-understand_str_v1_zh', 'args'],
        ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'args'],
        ['plugin_eval-p10-review_str_v1_zh', 'review_quality'],
        # '######## MUS-T-Eval ########', # category
        ['plugin_eval-mus-p10', 'naive_average'],
        ['plugin_eval-mus-p10-instruct_v1', 'format_metric'],
        ['plugin_eval-mus-p10-instruct_v1', 'args_em_metric'],
        ['plugin_eval-mus-p10-plan_str_v1', 'f1_score'],
        ['plugin_eval-mus-p10-plan_json_v1', 'f1_score'],
        ['plugin_eval-mus-p10-reason_str_v1', 'thought'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'thought'],
        ['plugin_eval-mus-p10-retrieve_str_v1', 'name'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'name'],
        ['plugin_eval-mus-p10-understand_str_v1', 'args'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'args'],
        ['plugin_eval-mus-p10-review_str_v1', 'review_quality'],
        ['plugin_eval-mus-p10_zh', 'naive_average'],
        ['plugin_eval-mus-p10-instruct_v1_zh', 'format_metric'],
        ['plugin_eval-mus-p10-instruct_v1_zh', 'args_em_metric'],
        ['plugin_eval-mus-p10-plan_str_v1_zh', 'f1_score'],
        ['plugin_eval-mus-p10-plan_json_v1_zh', 'f1_score'],
        ['plugin_eval-mus-p10-reason_str_v1_zh', 'thought'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
        ['plugin_eval-mus-p10-retrieve_str_v1_zh', 'name'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'name'],
        ['plugin_eval-mus-p10-understand_str_v1_zh', 'args'],
        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'args'],
        ['plugin_eval-mus-p10-review_str_v1_zh', 'review_quality'],
        # ['plugin_eval-p10', 'naive_average'],
        # ['plugin_eval-mus-p10', 'naive_average'],
        # ['plugin_eval-p10_zh', 'naive_average'],
        # ['plugin_eval-mus-p10_zh', 'naive_average'],
        'agent',
        'agent_cn',
        'agent_en',
        'cibench_template_cn',
        'cibench_template',
        'plugin_eval-mus-p10_one_review_zh',
        'plugin_eval-mus-p10_one_review',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
......
@@ -21,30 +21,22 @@ code_passk_summary_groups = [
     {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]},
     # real add
     {'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']},
-    {'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']}
+    # {'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']}
+    {'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)']},
+    {'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
+    {'name': 'code', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
 ]
 
 summarizer = dict(
     dataset_abbrs=[
         'code',
-        'humaneval_pass@1(greedy)',
-        'humaneval_pass@10',
+        'code_cn',
+        'code_en',
         'humaneval_cn_pass@1(greedy)',
-        'humaneval_cn_pass@10',
         'humaneval_plus_pass@1(greedy)',
-        'humaneval_plus_pass@10',
-        'mbpp_pass@1(greedy)',
-        'mbpp_pass@10',
         'mbpp_cn_pass@1(greedy)',
-        'mbpp_cn_pass@10',
         'sanitized_mbpp_pass@1(greedy)',
-        'sanitized_mbpp_pass@10',
         'humanevalx',
-        'humanevalx-python',
-        'humanevalx-cpp',
-        'humanevalx-go',
-        'humanevalx-java',
-        'humanevalx-js',
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
......
@@ -15,21 +15,13 @@ compassbench_v1_knowledge_groups = [
        'compassbench_v1_knowledge-mixed-cloze_en'

summarizer = dict(
    dataset_abbrs=[
        'knowledge_acc_1_and_cloze',
        ['knowledge_cn', 'acc_1'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],
        'compassbench_v1_knowledge-mixed-cloze_en',
        'knowledge_perf_4_and_cloze',
        ['knowledge_cn', 'perf_4'],
        'compassbench_v1_knowledge-mixed-cloze_en',
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
        'compassbench_v1_knowledge-mixed-cloze_en',
    ],
    summary_groups=compassbench_v1_knowledge_groups
)

# This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen`
compassbench_v1_math_groups = [
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'acc_1'], ['compassbench_v1_math-high-single_choice_en', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
    {'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [['compassbench_v1_math-high-single_choice_cn', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn', 'accuracy']]},
    {'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [['compassbench_v1_math-high-single_choice_en', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en', 'perf_4'], ['compassbench_v1_math-primary-cloze_en', 'accuracy']]},
]

summarizer = dict(
    dataset_abbrs=[
        'math_acc_1_and_fill_in_blank',
        ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
        'math_perf_4_and_fill_in_blank',
        'math_perf_4_and_fill_in_blank_cn',
        'math_perf_4_and_fill_in_blank_en',
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
......
@@ -34,37 +34,18 @@ compassbench_v1_language_groups = [
summarizer = dict(
    dataset_abbrs=[
        'language_acc_1_and_non_mcq',
        'language_en_acc_1_and_non_mcq',
        'language_zh_acc_1_and_non_mcq',
        # ['information_retrieval_en', 'score'],
        # ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'acc_origin'],
        ['intention_recognition_zh_circular', 'acc_origin'],
        ['sentiment_analysis_en_circular', 'acc_origin'],
        ['sentiment_analysis_zh_circular', 'acc_origin'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'acc_origin'],
        ['content_critic_zh_circular', 'acc_origin'],
        ['content_summarization_en', 'rouge1'],
        ['content_summarization_zh', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
        ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
        'language_perf_4_and_non_mcq',
        'language_en_perf_4_and_non_mcq',
        'language_zh_perf_4_and_non_mcq',
        # ['information_retrieval_en', 'score'],
        # ['information_retrieval_zh', 'score'],
        ['intention_recognition_en_circular', 'perf_circular'],
        'language_en_perf_4_and_non_mcq',
        ['intention_recognition_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['intention_recognition_en_circular', 'perf_circular'],
        ['sentiment_analysis_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['translation', 'score'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_critic_zh_circular', 'perf_circular'],
        ['content_summarization_en', 'rouge1'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_summarization_zh', 'rouge1'],
        ['content_summarization_en', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
        ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
    ],
......