Unverified commit 2a62bea1, authored by TTTTTiam, committed by GitHub
Browse files

add evaluation of scibench (#393)



* add evaluation of scibench

* add evaluation of scibench

* update scibench

* remove scibench evaluator

---------
Co-authored-by: Leymore <zfz-960727@163.com>
parent 07574fdd
from mmengine.config import read_base
with read_base():
from .scibench_gen_2b21f3 import scibench_datasets # noqa: F401, F403
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ScibenchDataset, scibench_postprocess
# Reader config shared by every scibench dataset variant: the model sees the
# problem text and is scored against the reference numeric answer.
scibench_reader_cfg = dict(input_columns=['question'], output_column='answer')

# The ten SciBench subject subsets (chemistry, physics and mathematics
# textbooks); each has a matching `<name>.json` data file.
scibench_subsets = [
    'atkins',
    'calculus',
    'chemmc',
    'class',
    'diff',
    'fund',
    'matter',
    'quan',
    'stat',
    'thermo',
]
# Build one dataset config per (prompt style, subject) pair.
# Styles: "zs" zero-shot, "zs-cot" zero-shot chain-of-thought,
# "fs" few-shot, "fs-cot" few-shot chain-of-thought. The few-shot styles
# read their in-context examples from lib_prompt/ next to this config.
scibench_datasets = []
for prompt_type in ["zs", "zs-cot", "fs", "fs-cot"]:
    for _name in scibench_subsets:
        # Pick the example file for few-shot styles; zero-shot uses none.
        if prompt_type == "fs":
            prompt_path = os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}_prompt.txt')
        elif prompt_type == "fs-cot":
            prompt_path = os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}_sol.txt')
        else:
            prompt_path = None

        if prompt_path is not None:
            # Fix: read with an explicit encoding so the config does not
            # depend on the platform default (prompt files may contain
            # non-ASCII math symbols).
            with open(prompt_path, 'r', encoding='utf-8') as f:
                _hint = f.read()
        else:
            _hint = ""

        # Per-style human-turn prompt template; {question} is filled in by
        # the prompt template at inference time.
        human_prompt = {
            'zs': "Please provide a clear and step-by-step solution for a scientific problem in the categories of Chemistry, Physics, or Mathematics. The problem will specify the unit of measurement, which should not be included in the answer. Express the final answer as a decimal number with three digits after the decimal point. Conclude the answer by stating 'Therefore, the answer is \\boxed[ANSWER].'\n\nProblem: {question}\nAnswer:",
            'zs-cot': "Please provide a clear and step-by-step solution for a scientific problem in the categories of Chemistry, Physics, or Mathematics. The problem will specify the unit of measurement, which should not be included in the answer. Express the final answer as a decimal number with three digits after the decimal point. Conclude the answer by stating 'Therefore, the answer is \\boxed[ANSWER].'\n\nProblem: {question}\nAnswer:Let’s think step by step.",
            'fs': f"{_hint}\n\nProblem 6: {{question}}\nAnswer: ",
            'fs-cot': f"{_hint}\n\nProblem 6: {{question}}\nExplanation for Problem 6: ",
        }[prompt_type]

        scibench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role="HUMAN", prompt=human_prompt)
                ])
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512)
        )

        scibench_eval_cfg = dict(
            evaluator=dict(type=AccEvaluator),
            pred_postprocessor=dict(type=scibench_postprocess))

        scibench_datasets.append(
            dict(
                type=ScibenchDataset,
                path="./data/scibench",
                name=_name,
                # Zero-shot keeps the bare abbr so existing result tables and
                # the "" suffix in the summary groups still match.
                abbr=f"scibench-{_name}" if prompt_type == 'zs' else f"scibench-{_name}_{prompt_type}",
                reader_cfg=scibench_reader_cfg,
                infer_cfg=scibench_infer_cfg.copy(),
                eval_cfg=scibench_eval_cfg.copy()
            )
        )
......@@ -10,6 +10,7 @@ with read_base():
from .groups.flores import flores_summary_groups
from .groups.tydiqa import tydiqa_summary_groups
from .groups.xiezhi import xiezhi_summary_groups
from .groups.scibench import scibench_summary_groups
summarizer = dict(
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
......
# Aggregate the ten SciBench subjects into one summary row per prompt style.
# The empty suffix corresponds to the zero-shot ("zs") variant, whose dataset
# abbrs carry no suffix.
scibench_tasks = [
    "atkins", "calculus", "chemmc", "class", "diff",
    "fund", "matter", "quan", "stat", "thermo",
]
scibench_summary_groups = [
    {
        'name': f'scibench{suffix}',
        'subsets': [f'scibench-{task}{suffix}' for task in scibench_tasks],
    }
    for suffix in ["", "_zs-cot", "_fs", "_fs-cot"]
]
......@@ -56,6 +56,7 @@ from .race import * # noqa: F401, F403
from .realtoxicprompts import * # noqa: F401, F403
from .record import * # noqa: F401, F403
from .safety import * # noqa: F401, F403
from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403
from .siqa import * # noqa: F401, F403
from .squad20 import SQuAD20Dataset, SQuAD20Evaluator # noqa: F401, F403
from .storycloze import * # noqa: F401, F403
......
import json
import os.path as osp
import re
from datasets import Dataset
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
@LOAD_DATASET.register_module()
class ScibenchDataset(BaseDataset):
    """Loader for the SciBench benchmark.

    Reads ``{path}/{name}.json`` — a list of entries each providing a
    ``problem_text`` and an ``answer_number`` string — and exposes them as a
    ``datasets.Dataset`` with ``question``/``answer`` columns, matching the
    reader config (``input_columns=['question'], output_column='answer'``).
    """

    @staticmethod
    def load(path: str, name: str):
        """Load one SciBench subject subset.

        Args:
            path: Directory that holds the subset JSON files.
            name: Subset name, e.g. ``atkins``; selects ``{name}.json``.

        Returns:
            datasets.Dataset: rows with ``question`` and ``answer`` fields,
            both whitespace-stripped.
        """
        filename = osp.join(path, f'{name}.json')
        # Fix: explicit encoding — problem texts contain non-ASCII math
        # symbols, so don't rely on the platform default encoding.
        with open(filename, 'r', encoding='utf-8') as infile:
            raw_data = json.load(infile)
        train_data = [{
            'question': entry['problem_text'].strip(),
            'answer': entry['answer_number'].strip(),
        } for entry in raw_data]
        return Dataset.from_list(train_data)
@TEXT_POSTPROCESSORS.register_module('scibench')
def scibench_postprocess(text: str) -> str:
    """Extract the final numeric answer from a model response.

    Priority order: the content of a ``\\boxed{...}`` expression after the
    phrase 'answer is' (if present); otherwise the last number in the text
    with thousands separators removed; otherwise the (possibly trimmed)
    text itself.
    """
    candidate = text
    segments = candidate.split('answer is')
    # More than one segment means the phrase occurred; keep what follows it.
    if len(segments) != 1:
        candidate = segments[1].strip()

    boxed = re.search(r'\\boxed\{(.+?)\}', candidate)
    if boxed is not None:
        return boxed.group(1)

    # Strip commas used as thousands separators (e.g. "1,234" -> "1234"),
    # then take the last number that appears.
    cleaned = re.sub(r'(\d),(\d)', r'\1\2', candidate)
    numbers = re.findall(r'-?\d*\.?\d+|\d+', cleaned)
    return numbers[-1] if numbers else candidate
......@@ -141,7 +141,7 @@ class OpenICLEvalTask(BaseTask):
filename = root + f'_{i}' + ext
i += 1
preds = {k: [pred[k] for pred in preds] for k in preds[0]}
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment