[Sync] update taco (#1030)

b39f5015 · Fengzhe Zhou · GitHub · 16f29b25 · b39f5015 · b39f5015
Unverified Commit b39f5015 authored Apr 09, 2024 by Fengzhe Zhou Committed by GitHub Apr 09, 2024
20 changed files
--- a/.github/scripts/pr_oc_score_assert.py
+++ b/.github/scripts/pr_oc_score_assert.py
+import csv
+import os
+
+import pytest
+
+# Directory that is walked recursively to locate the single result CSV.
+output_path = 'regression_result'
+# Column of the result CSV (used as the outer key of the parsed results).
+model = 'internlm-chat-7b-hf'
+# Value of the CSV's 'dataset' column whose score is checked.
+dataset = 'siqa'
+
+
+@pytest.fixture()
+def result_scores():
+    """Return the parsed result table, or ``None`` when no CSV exists.
+
+    The CSV is located with ``find_csv_files(output_path)`` and parsed
+    with ``read_csv_file``.
+    """
+    csv_path = find_csv_files(output_path)
+    return None if csv_path is None else read_csv_file(csv_path)
+
+
+@pytest.mark.usefixtures('result_scores')
+class TestChatScore:
+    """Test cases for chat model."""
+
+    def test_model_dataset_score(self, result_scores):
+        """Check the configured model/dataset score against its baseline.
+
+        Fails with an explicit message (instead of an AttributeError on
+        ``None``) when no result file was found or the model column is
+        absent from it.
+        """
+        assert result_scores is not None, (
+            'no result csv found under ' + output_path)
+        model_scores = result_scores.get(model)
+        assert model_scores is not None, (
+            model + ' not found in the result file')
+        assert_score(model_scores.get(dataset), 73.59)
+
+
+def assert_score(score, baseline):
+    """Assert that ``score`` lies strictly within 3% of ``baseline``.
+
+    Args:
+        score: Measured score, either a number or a numeric string.
+            ``None`` and the placeholder ``'-'`` count as missing.
+        baseline: Expected score as a number.
+
+    Raises:
+        AssertionError: If the score is missing or falls outside the open
+            interval ``(baseline * 0.97, baseline * 1.03)``.
+    """
+    if score is None or score == '-':
+        raise AssertionError('value is none')
+    lower, upper = baseline * 0.97, baseline * 1.03
+    bounds = str(lower) + ' and ' + str(upper)
+    # str(score) keeps message building safe when score is not a string.
+    if lower < float(score) < upper:
+        print(str(score) + ' between ' + bounds)
+    else:
+        raise AssertionError(str(score) + ' not between ' + bounds)
+
+
+def find_csv_files(directory):
+    """Return the path of the only ``.csv`` file below ``directory``.
+
+    Args:
+        directory: Root directory that is walked recursively.
+
+    Returns:
+        The path of the single CSV file found, or ``None`` when there is
+        no CSV file at all.
+
+    Raises:
+        RuntimeError: If more than one CSV file is found.
+    """
+    csv_files = [
+        os.path.join(root, name)
+        for root, _dirs, names in os.walk(directory)
+        for name in names
+        if name.endswith('.csv')
+    ]
+    if len(csv_files) > 1:
+        # Raising a bare string is itself a TypeError in Python 3, which
+        # would hide this message; raise a real exception instead.
+        raise RuntimeError(
+            'have more than 1 result file, please check the result manually')
+    return csv_files[0] if csv_files else None
+
+
+def read_csv_file(file_path):
+    """Parse a result CSV into ``{column: {dataset: value}}``.
+
+    The ``version``, ``metric`` and ``mode`` columns are dropped. Every
+    remaining column except ``dataset`` becomes an outer key whose value
+    maps each row's ``dataset`` entry to that row's cell (kept as the raw
+    string read from the file).
+    """
+    ignored = ['version', 'metric', 'mode']
+    with open(file_path, 'r') as csvfile:
+        rows = [{
+            column: cell
+            for column, cell in row.items() if column not in ignored
+        } for row in csv.DictReader(csvfile)]
+
+    scores = {}
+    for row in rows:
+        dataset_name = row.get('dataset')
+        for column, cell in row.items():
+            if column != 'dataset':
+                scores.setdefault(column, {})[dataset_name] = cell
+    return scores
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ configs/eval_debug*.py
 configs/viz_*.py
 data
 work_dirs
+outputs
 models/*
 configs/internal/
 # Byte-compiled / optimized / DLL files
@@ -121,6 +122,7 @@ turbomind/
 *.txt
 *.jpg
 *.json
+*.jsonl
 *.csv
 *.npy
 *.c

--- a/configs/datasets/MathBench/mathbench_2024_gen_de9ff9.py
+++ b/configs/datasets/MathBench/mathbench_2024_gen_de9ff9.py
--- a/configs/datasets/MathBench/mathbench_2024_gen_649349.py
+++ b/configs/datasets/MathBench/mathbench_2024_gen_649349.py
@@ -60,7 +60,7 @@ mathbench_sets = {
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn', 'single_choice_en'],
    'primary': ['cloze_cn', 'cloze_en'],
-    'calculate': ['cloze_en'],
+    'arithmetic': ['cloze_en'],
    # Theory part
    'college_knowledge': ['single_choice_cn','single_choice_en'],
    'high_knowledge': ['single_choice_cn','single_choice_en'],
@@ -102,7 +102,7 @@ for _split in list(mathbench_sets.keys()):
            dict(
                abbr="mathbench-" + _split + '-' + _name,
                type=MathBenchDataset,
-                path=f"./data/mathbench_v1_ori/{_split}",
+                path=f"./data/mathbench_v1/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(

--- a/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_649349.py
+++ b/configs/datasets/MathBench/mathbench_2024_wocircular_mixed_649349.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+single_choice_prompts = {
+    "single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题，请你一步一步推理，并在最后用“所以答案为选项X”给出答案，其中“X”为选项A，B，C，D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考：\n",
+    "single_choice_cn": "以下是一道关于数学的单项选择题，请你直接回答正确答案的选项序号。\n下面是你要回答的题目：\n{question}\n答案选项：\n",
+    "single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A，B，C，D. Here is the question you need to answer:\n{question}\nLet's think step by step:\n",
+    "single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n",
+}
+
+cloze_prompts = {
+    "cloze_cn": [
+        "Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后，将有21棵树。林务工人员今天种植了多少棵树？\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以，他们必须种植了21 - 15 = 6棵树。答案是 6",
+        "Q: 如果停车场有3辆车，又有2辆车进来，停车场里有多少辆车？\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5",
+        "Q: 黎恩有32块巧克力，她的妹妹有42块。如果他们吃了35块，他们总共剩下多少块？\nA: 黎恩有32块巧克力，Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39",
+        "Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖？\nA: 杰森有20个棒棒糖。因为他现在只剩下12个，所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8",
+        "Q: 莎莎有五个玩具。在圣诞节，他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具？\nA: 她有5个玩具。他从妈妈那里得到了2个，所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个，所以总共他有7 + 2 = 9个玩具。答案是 9",
+        "Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑？\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑，所以现在有9 + 20 = 29台电脑。答案是 29",
+        "Q: 迈克尔有58个高尔夫球。星期二，他丢失了23个高尔夫球。星期三，他又丢失了2个。星期三结束时他还剩下多少个高尔夫球？\nA: 迈克尔一开始有58个球。星期二他丢失了23个，所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个，所以现在他还剩下35 - 2 = 33个球。答案是 33",
+        "Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱？\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元，所以现在她还剩下23 - 15 = 8美元。答案是 8",
+        "Q: {question}\nA: {answer}",
+    ],
+    "cloze_en": [
+        "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.",
+        "Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.",
+        "Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.",
+        "Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.",
+        "Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.",
+        "Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.",
+        "Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.",
+        "Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.",
+        "Q: {question}\nA: {answer}",
+    ],
+}
+
+mathbench_sets = {
+    # Practice Part
+    "college": ["single_choice_cn", "single_choice_en"],
+    "high": ["single_choice_cn", "single_choice_en"],
+    "middle": ["single_choice_cn", "single_choice_en"],
+    "primary": ["cloze_cn", "cloze_en"],
+    "calculate": ["cloze_en"],
+    # Theory part
+    "college_knowledge": ["single_choice_cn", "single_choice_en"],
+    "high_knowledge": ["single_choice_cn", "single_choice_en"],
+    "middle_knowledge": ["single_choice_cn", "single_choice_en"],
+    "primary_knowledge": ["single_choice_cn", "single_choice_en"],
+}
+
+# Generate reasoning path or not, only for single choice
+with_reasoning = False
+
+# Use circular evaluation or not
+with_circular_eval = False
+
+# One dataset config is appended per (split, prompt name) pair listed in
+# mathbench_sets.
+mathbench_datasets = []
+
+for _split in list(mathbench_sets.keys()):
+    for _name in mathbench_sets[_split]:
+        mathbench_reader_cfg = dict(
+            input_columns=["question"],
+            output_column="answer",
+        )
+
+        # Single-choice sets: a generation prompt when with_reasoning is on,
+        # otherwise one template per option A-D handed to PPLInferencer.
+        if "single_choice" in _name:
+            if with_reasoning:
+                mathbench_infer_cfg = dict(
+                    prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + "_with_reasoning"]),
+                    retriever=dict(type=ZeroRetriever),
+                    inferencer=dict(type=GenInferencer, max_out_len=512),
+                )
+            else:
+                mathbench_infer_cfg = dict(
+                    prompt_template=dict(type=PromptTemplate, template={answer: f"{single_choice_prompts[_name]}{answer}" for answer in ['A', 'B', 'C', 'D']}),
+                    retriever=dict(type=ZeroRetriever),
+                    inferencer=dict(type=PPLInferencer),
+                )
+        else:
+            # Cloze sets: the worked examples and the final question slot are
+            # joined with newlines into one generation prompt.
+            mathbench_infer_cfg = dict(
+                prompt_template=dict(type=PromptTemplate, template='\n'.join(cloze_prompts[_name])),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=GenInferencer, max_out_len=512),
+            )
+
+
+        if "single_choice" in _name:
+            pred_postprocessor = dict(type=first_option_postprocess, options="ABCD")
+        else:
+            pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
+        # CircularEvaluator is only selected for single-choice sets when
+        # with_circular_eval is enabled; everything else uses AccEvaluator.
+        if "single_choice" in _name and with_circular_eval:
+            evaluator = dict(type=CircularEvaluator)
+        else:
+            evaluator = dict(type=AccEvaluator)
+        mathbench_eval_cfg = dict(
+            evaluator=evaluator,
+            pred_postprocessor=pred_postprocessor,
+        )
+
+        # NOTE(review): mathbench_2024_gen_649349 in this same commit moved
+        # to "./data/mathbench_v1" and renamed the 'calculate' split to
+        # 'arithmetic'; confirm this file is meant to keep the "_ori" path
+        # and the 'calculate' split.
+        mathbench_datasets.append(
+            dict(
+                abbr="mathbench-" + _split + "-" + _name,
+                type=MathBenchDataset,
+                path=f"./data/mathbench_v1_ori/{_split}",
+                name=_name,
+                with_circular=with_circular_eval,
+                reader_cfg=mathbench_reader_cfg,
+                infer_cfg=mathbench_infer_cfg,
+                eval_cfg=mathbench_eval_cfg,
+            )
+        )
--- a/configs/datasets/MathBench/mathbench_gen.py
+++ b/configs/datasets/MathBench/mathbench_gen.py
 from mmengine.config import read_base

 with read_base():
-    from .mathbench_2024_gen_de9ff9 import mathbench_datasets  # noqa: F401, F403
+    from .mathbench_2024_gen_649349 import mathbench_datasets  # noqa: F401, F403
--- a/configs/datasets/apps/apps_mini_gen_c7893a.py
+++ b/configs/datasets/apps/apps_mini_gen_c7893a.py
@@ -19,7 +19,7 @@ APPS_mini_datasets = [
    dict(
        type=APPS_miniDataset,
        abbr="apps_mini",
-        path="codeparrot_mini/apps",
+        path="./data/apps_mini",
        num_repeats=1,
        reader_cfg=APPS_reader_cfg,
        infer_cfg=APPS_infer_cfg,

--- a/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py
+++ b/configs/datasets/ceval/ceval_internal_ppl_1cd8bf.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CEvalDataset
+
+ceval_subject_mapping = {
+    'computer_network': ['Computer Network', '计算机网络', 'STEM'],
+    'operating_system': ['Operating System', '操作系统', 'STEM'],
+    'computer_architecture': ['Computer Architecture', '计算机组成', 'STEM'],
+    'college_programming': ['College Programming', '大学编程', 'STEM'],
+    'college_physics': ['College Physics', '大学物理', 'STEM'],
+    'college_chemistry': ['College Chemistry', '大学化学', 'STEM'],
+    'advanced_mathematics': ['Advanced Mathematics', '高等数学', 'STEM'],
+    'probability_and_statistics': ['Probability and Statistics', '概率统计', 'STEM'],
+    'discrete_mathematics': ['Discrete Mathematics', '离散数学', 'STEM'],
+    'electrical_engineer': ['Electrical Engineer', '注册电气工程师', 'STEM'],
+    'metrology_engineer': ['Metrology Engineer', '注册计量师', 'STEM'],
+    'high_school_mathematics': ['High School Mathematics', '高中数学', 'STEM'],
+    'high_school_physics': ['High School Physics', '高中物理', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', '高中化学', 'STEM'],
+    'high_school_biology': ['High School Biology', '高中生物', 'STEM'],
+    'middle_school_mathematics': ['Middle School Mathematics', '初中数学', 'STEM'],
+    'middle_school_biology': ['Middle School Biology', '初中生物', 'STEM'],
+    'middle_school_physics': ['Middle School Physics', '初中物理', 'STEM'],
+    'middle_school_chemistry': ['Middle School Chemistry', '初中化学', 'STEM'],
+    'veterinary_medicine': ['Veterinary Medicine', '兽医学', 'STEM'],
+    'college_economics': ['College Economics', '大学经济学', 'Social Science'],
+    'business_administration': ['Business Administration', '工商管理', 'Social Science'],
+    'marxism': ['Marxism', '马克思主义基本原理', 'Social Science'],
+    'mao_zedong_thought': ['Mao Zedong Thought', '毛泽东思想和中国特色社会主义理论体系概论', 'Social Science'],
+    'education_science': ['Education Science', '教育学', 'Social Science'],
+    'teacher_qualification': ['Teacher Qualification', '教师资格', 'Social Science'],
+    'high_school_politics': ['High School Politics', '高中政治', 'Social Science'],
+    'high_school_geography': ['High School Geography', '高中地理', 'Social Science'],
+    'middle_school_politics': ['Middle School Politics', '初中政治', 'Social Science'],
+    'middle_school_geography': ['Middle School Geography', '初中地理', 'Social Science'],
+    'modern_chinese_history': ['Modern Chinese History', '近代史纲要', 'Humanities'],
+    'ideological_and_moral_cultivation': ['Ideological and Moral Cultivation', '思想道德修养与法律基础', 'Humanities'],
+    'logic': ['Logic', '逻辑学', 'Humanities'],
+    'law': ['Law', '法学', 'Humanities'],
+    'chinese_language_and_literature': ['Chinese Language and Literature', '中国语言文学', 'Humanities'],
+    'art_studies': ['Art Studies', '艺术学', 'Humanities'],
+    'professional_tour_guide': ['Professional Tour Guide', '导游资格', 'Humanities'],
+    'legal_professional': ['Legal Professional', '法律职业资格', 'Humanities'],
+    'high_school_chinese': ['High School Chinese', '高中语文', 'Humanities'],
+    'high_school_history': ['High School History', '高中历史', 'Humanities'],
+    'middle_school_history': ['Middle School History', '初中历史', 'Humanities'],
+    'civil_servant': ['Civil Servant', '公务员', 'Other'],
+    'sports_science': ['Sports Science', '体育学', 'Other'],
+    'plant_protection': ['Plant Protection', '植物保护', 'Other'],
+    'basic_medicine': ['Basic Medicine', '基础医学', 'Other'],
+    'clinical_medicine': ['Clinical Medicine', '临床医学', 'Other'],
+    'urban_and_rural_planner': ['Urban and Rural Planner', '注册城乡规划师', 'Other'],
+    'accountant': ['Accountant', '注册会计师', 'Other'],
+    'fire_engineer': ['Fire Engineer', '注册消防工程师', 'Other'],
+    'environmental_impact_assessment_engineer': ['Environmental Impact Assessment Engineer', '环境影响评价工程师', 'Other'],
+    'tax_accountant': ['Tax Accountant', '税务师', 'Other'],
+    'physician': ['Physician', '医师资格', 'Other'],
+}
+ceval_all_sets = list(ceval_subject_mapping.keys())
+
+# One dataset config is appended per (split, subject) pair.
+ceval_datasets = []
+for _split in ["val", "test"]:
+    for _name in ceval_all_sets:
+        ceval_reader_cfg = dict(
+            input_columns=["question", "A", "B", "C", "D"],
+            output_column="answer",
+            train_split="dev",
+            test_split=_split,
+        )
+
+        # Index 1 of each mapping entry is the Chinese subject name.
+        _ch_name = ceval_subject_mapping[_name][1]
+
+        hint = f"以下是关于{_ch_name}的单项选择题，请直接给出正确答案的选项。"
+        question_and_options = "{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}"
+        # Both templates are dicts keyed by candidate answer (A-D); the
+        # prompt template marks where in-context examples go with "</E>".
+        ceval_infer_cfg = dict(
+            ice_template=dict(
+                type=PromptTemplate,
+                template={answer: f"{question_and_options}\n答案: {answer}\n" for answer in ["A", "B", "C", "D"]},
+            ),
+            prompt_template=dict(
+                type=PromptTemplate,
+                template={answer: f"{hint}\n</E>{question_and_options}\n答案: {answer}" for answer in ["A", "B", "C", "D"]},
+                ice_token="</E>",
+            ),
+            # Fixed example ids 0-4; presumably drawn from the "dev"
+            # train_split above -- TODO confirm.
+            retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+            inferencer=dict(type=PPLInferencer),
+        )
+
+        ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+        # "val" entries use the plain "ceval-" prefix; "test" entries are
+        # distinguished with "ceval-test-".
+        ceval_datasets.append(
+            dict(
+                type=CEvalDataset,
+                path="./data/ceval_internal/formal_ceval",
+                name=_name,
+                abbr="ceval-" + _name if _split == "val" else "ceval-test-" + _name,
+                reader_cfg=ceval_reader_cfg,
+                infer_cfg=ceval_infer_cfg,
+                eval_cfg=ceval_eval_cfg,
+            ))
--- a/configs/datasets/collections/chat_core.py
+++ b/configs/datasets/collections/chat_core.py
@@ -12,7 +12,7 @@ with read_base():
    from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
-    from ..math.math_evaluatorv2_gen_265cce import math_datasets
+    from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
    from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
    from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets

--- a/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py
+++ b/configs/datasets/gsm8k/gsm8k_0shot_gen_a58960.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
+
+gsm8k_reader_cfg = dict(input_columns=["question"], output_column="answer")
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role="HUMAN", prompt="{question}\nPlease reason step by step, and put your final answer within \\boxed{}."),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+gsm8k_eval_cfg = dict(
+    evaluator=dict(type=Gsm8kEvaluator),
+    pred_postprocessor=dict(type=gsm8k_postprocess),
+    dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
+)
+
+gsm8k_datasets = [
+    dict(
+        abbr="gsm8k",
+        type=GSM8KDataset,
+        path="./data/gsm8k",
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg,
+    )
+]
--- a/configs/datasets/math/math_evaluatorv2_gen_265cce.py
+++ b/configs/datasets/math/math_evaluatorv2_gen_265cce.py
--- a/configs/datasets/math/math_0shot_gen_393424.py
+++ b/configs/datasets/math/math_0shot_gen_393424.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
+
+math_reader_cfg = dict(input_columns=["problem"], output_column="solution")
+
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role="HUMAN", prompt="{problem}\nPlease reason step by step, and put your final answer within \\boxed{}."),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024),
+)
+
+# postprocess v2
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version="v2"), pred_postprocessor=dict(type=math_postprocess_v2),
+)
+
+math_datasets = [
+    dict(
+        type=MATHDataset,
+        abbr="math",
+        path="./data/math/math.json",
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
--- a/configs/datasets/math/math_evaluatorv2_gen_cecb31.py
+++ b/configs/datasets/math/math_evaluatorv2_gen_cecb31.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2
+
+math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+# Four worked Problem/Solution rounds followed by the actual problem slot.
+# Every LaTeX backslash is written as "\\" so that no invalid escape
+# sequence (e.g. "\s", "\d", "\c") appears in these non-raw literals; the
+# resulting string values are unchanged.
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(role="HUMAN", prompt="Problem:\nFind the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.}}\nSolution:"),
+            dict(role="BOT", prompt="The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n"),
+            dict(role="HUMAN", prompt="Problem:\nIf $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$\nSolution:"),
+            dict(role="BOT", prompt="We have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"),
+            dict(role="HUMAN", prompt="Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"),
+            dict(role="BOT", prompt="If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"),
+            dict(role="HUMAN", prompt="Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"),
+            dict(role="BOT", prompt="If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"),
+            dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=1024))
+
+# postprocess v2
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator, version='v2'),
+    pred_postprocessor=dict(type=math_postprocess_v2))
+
+math_datasets = [
+    dict(
+        type=MATHDataset,
+        abbr='math',
+        path='./data/math/math.json',
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg)
+]
--- a/configs/datasets/subjective/compassarena/compassarena_compare.py
+++ b/configs/datasets/subjective/compassarena/compassarena_compare.py
@@ -91,7 +91,7 @@ reason_prompt = math_prompt
 creation_prompt = """
 请根据提供的 评分要求，用户问题 以及 相应的两个回答（回答1，回答2），判断两个回答中哪一个更好。
 评分要求（重要性依次递减）:
-1. 好的回答必须首先符合用户问题里的各种需求，不能跑题 
+1. 好的回答必须首先符合用户问题里的各种需求，不能跑题
 2. 好的回答必须具有逻辑连贯性，围绕一个中心进行回答
 3. 好的回答必须具有创造性的词语和表达丰富度

@@ -99,7 +99,7 @@ creation_prompt = """
 {question}
 """ + base_prompt

-sub_map = {"knowledge": knowledge_prompt, "language": language_prompt, "math_v2": math_prompt, "reason_v2": reason_prompt, "creationv2_zh": creation_prompt}
+sub_map = {"language": language_prompt, "knowledge": knowledge_prompt, "reason_v2": reason_prompt, "math_v2": math_prompt, "creationv2_zh": creation_prompt}

 for _name, _prompt in sub_map.items():
    subjective_infer_cfg = dict(

--- a/configs/datasets/subjective/compassarena/compassarena_compare_creationv3.py
+++ b/configs/datasets/subjective/compassarena/compassarena_compare_creationv3.py
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.datasets import CompassArenaDataset
+
+subjective_reader_cfg = dict(
+    input_columns=['question', 'ref'],
+    output_column='judge',
+    )
+
+data_path ="data/subjective/compass_arena"
+
+subjective_datasets = []
+
+base_prompt = """
+
+[回答1开始]
+{prediction}
+[回答1结束]
+
+[回答2开始]
+{prediction2}
+[回答2结束]
+
+根据评分要求，在以下 3 个选项中做出选择:
+A. 回答1更好
+B. 回答2更好
+C. 回答1、2平局
+并提供你的解释原因。
+
+如果你认为回答1更好，你的输出应形如：
+选择：A
+原因：blahblah blahblah\n
+
+如果你认为回答2更好，你的输出应形如：
+选择：B
+原因：blahblah blahblah\n
+
+如果你认为回答1、2打成平手，你的输出应形如：
+选择：C
+原因：blahblah blahblah\n
+"""
+
+knowledge_prompt = """
+请根据提供的 评分要求，用户问题，参考答案 以及 相应的两个回答（回答1，回答2），判断两个回答中哪一个更好。
+评分要求（重要性依次递减）:
+1. 更好的回答能与参考答案吻合或表明参考答案的意思。
+2. 在都准确答对问题的前提下，更好的回答能对知识点进行额外补充，且补充的知识准确无误。
+3. 更好的回答更加符合与人类对话的习惯，包括语气、情调等。
+
+[用户问题]
+{question}
+
+[参考答案]
+{ref}
+""" + base_prompt
+
+
+language_prompt = """
+请根据提供的 评分要求，用户问题 以及 相应的两个回答（回答1，回答2），判断两个回答中哪一个更好。
+评分要求（重要性依次递减）:
+1. 在有明确的参考答案的情况下，越贴近参考答案或表明了参考答案的意思的回答越好。
+2. 更好的回答在语言表达上更流畅，更加符合与人类对话的习惯，包括语气、情调等
+3. 在都准确答对问题的前提下，更好的回答能进行额外补充，且补充的内容准确无误。
+
+[用户问题]
+{question}
+
+[参考答案]
+{ref}
+""" + base_prompt
+
+
+math_prompt = """
+请根据提供的 评分要求，用户问题，参考答案 以及 相应的两个回答（回答1，回答2），判断两个回答中哪一个更好。
+评分要求（重要性依次递减）:
+1. 更好的回答的答案能和参考答案一致。
+2. 若两个回答的答案都与参考答案不一致，则更好的回答的推理过程应更加合理。
+3. 更好的回答更加符合与人类对话的习惯，包括语气、情调等。
+
+[用户问题]
+{question}
+
+[参考答案]
+{ref}
+""" + base_prompt
+
+reason_prompt = math_prompt
+
+creation_prompt = """
+请根据提供的 评分要求，用户问题 以及 相应的两个回答（回答1，回答2），判断两个回答中哪一个更好。
+评分要求（重要性依次递减）:
+1. 好的回答必须首先符合用户问题里的各种需求，不能跑题
+2. 好的回答必须具有逻辑连贯性，围绕一个中心进行回答
+3. 好的回答必须具有创造性的词语和表达丰富度
+
+[用户问题]
+{question}
+""" + base_prompt
+
+sub_map = {"creationv3": creation_prompt}
+
+# Build one dataset config per entry in sub_map (name -> judge prompt).
+for _name, _prompt in sub_map.items():
+    # Inference stage: the prompt is just the raw question.
+    subjective_infer_cfg = dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt="{question}"
+                    ),
+                ]),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
+        )
+
+    # Evaluation stage: LMEvaluator is given the comparison prompt for this
+    # entry. NOTE(review): infer_order='double' presumably judges both
+    # answer orderings -- confirm against LMEvaluator.
+    subjective_eval_cfg = dict(
+        evaluator=dict(
+            type=LMEvaluator,
+            infer_order='double',
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt = _prompt
+                    ),
+                ]),
+            ),
+        ),
+        pred_role="BOT",
+    )
+
+    # The sub_map key is used as both the abbreviation and the dataset name.
+    subjective_datasets.append(
+        dict(
+            abbr=f"{_name}",
+            type=CompassArenaDataset,
+            path=data_path,
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
+        ))
--- a/configs/datasets/taco/taco_gen_c7893a.py
+++ b/configs/datasets/taco/taco_gen_c7893a.py
@@ -19,7 +19,7 @@ TACO_datasets = [
    dict(
        type=TACODataset,
        abbr="TACO",
-        path='BAAI/TACO',
+        path='./data/BAAI-TACO',
        num_repeats = 1,
        reader_cfg=TACO_reader_cfg,
        infer_cfg=TACO_infer_cfg,

--- a/configs/eval_internlm2_chat_keyset.py
+++ b/configs/eval_internlm2_chat_keyset.py
@@ -6,7 +6,7 @@ with read_base():
    from .datasets.agieval.agieval_gen_64afd3 import agieval_datasets
    from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
    from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
-    from .datasets.math.math_evaluatorv2_gen_265cce import math_datasets
+    from .datasets.math.math_evaluatorv2_gen_cecb31 import math_datasets
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets


--- a/configs/models/mistral/hf_mistral_7b_v0_2.py
+++ b/configs/models/mistral/hf_mistral_7b_v0_2.py
+from opencompass.models import HuggingFaceCausalLM
+
+
+models = [
+    dict(
+        abbr='mistral-7b-v0.2-hf',
+        type=HuggingFaceCausalLM,
+        path='alpindale/Mistral-7B-v0.2-hf',
+        model_kwargs=dict(
+            device_map='auto',
+            trust_remote_code=True,
+        ),
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        max_out_len=100,
+        max_seq_len=2048,
+        batch_size=8,
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
--- a/configs/models/nanbeige/hf_nanbeige2_8b_chat.py
+++ b/configs/models/nanbeige/hf_nanbeige2_8b_chat.py
+from opencompass.models import HuggingFaceCausalLM
+
+_meta_template = dict(
+    begin="<|im_start|>system\n你是一个名为\"南北阁\"的人工智能助手，正在与人类用户进行交谈。你的目标是以最有帮助和最逻辑的方式回答问题，同时确保内容的安全性。你的回答中不应包含任何有害、政治化、宗教化、不道德、种族主义、非法的内容。请确保你的回答不带有社会偏见，符合社会主义价值观。如果遇到的问题无意义或事实上不连贯，请不要回答错误的内容，而是解释问题为何无效或不连贯。如果你不知道问题的答案，也请勿提供错误的信息。<|im_end|>\n",
+    round=[
+        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
+        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
+    ],
+)
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='nanbeige2-8b-chat-hf',
+        path="Nanbeige/Nanbeige2-8B-Chat",
+        tokenizer_path='Nanbeige/Nanbeige2-8B-Chat',
+        model_kwargs=dict(
+            device_map='auto',
+            torch_dtype='auto',
+            trust_remote_code=True,
+        ),
+        tokenizer_kwargs=dict(
+            padding_side='right',
+            truncation_side='left',
+            trust_remote_code=True,
+            use_fast=False,
+        ),
+        meta_template=_meta_template,
+        batch_padding=False,
+        max_out_len=100,
+        max_seq_len=4096,
+        batch_size=8,
+        run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
+    )
+]
--- a/configs/models/others/hf_dbrx_instruct.py
+++ b/configs/models/others/hf_dbrx_instruct.py
+
+from opencompass.models import HuggingFaceCausalLM
+
+
+_meta_template = dict(
+    round=[
+        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
+        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
+    ],
+)
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='dbrx-instruct-hf',
+        path="databricks/dbrx-instruct",
+        model_kwargs=dict(
+            trust_remote_code=True,
+            device_map='auto',
+        ),
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            use_fast=False,
+            trust_remote_code=True,
+        ),
+        max_out_len=100,
+        max_seq_len=2048,
+        batch_size=8,
+        meta_template=_meta_template,
+        run_cfg=dict(num_gpus=8, num_procs=1),
+        end_str='<|im_end|>',
+        batch_padding=True,
+    )
+]