Unverified commit 2a62bea1, authored by TTTTTiam, committed by GitHub
Browse files

add evaluation of scibench (#393)



* add evaluation of scibench

* add evaluation of scibench

* update scibench

* remove scibench evaluator

---------
Co-authored-by: Leymore <zfz-960727@163.com>
parent 07574fdd
from mmengine.config import read_base
with read_base():
from .scibench_gen_2b21f3 import scibench_datasets # noqa: F401, F403
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ScibenchDataset, scibench_postprocess
# Reader config shared by every scibench dataset variant: the model sees the
# problem text and is scored against the reference numeric answer.
scibench_reader_cfg = dict(input_columns=['question'], output_column='answer')

# The ten SciBench subject subsets (chemistry, physics and mathematics
# textbooks); each has a matching `<name>.json` data file.
scibench_subsets = [
    'atkins',
    'calculus',
    'chemmc',
    'class',
    'diff',
    'fund',
    'matter',
    'quan',
    'stat',
    'thermo',
]
# Build one dataset config per (prompt style, subject) pair.
# Styles: "zs" zero-shot, "zs-cot" zero-shot chain-of-thought,
# "fs" few-shot, "fs-cot" few-shot chain-of-thought. The few-shot styles
# read their in-context examples from lib_prompt/ next to this config.
scibench_datasets = []
for prompt_type in ["zs", "zs-cot", "fs", "fs-cot"]:
    for _name in scibench_subsets:
        # Pick the example file for few-shot styles; zero-shot uses none.
        if prompt_type == "fs":
            prompt_path = os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}_prompt.txt')
        elif prompt_type == "fs-cot":
            prompt_path = os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}_sol.txt')
        else:
            prompt_path = None

        if prompt_path is not None:
            # Fix: read with an explicit encoding so the config does not
            # depend on the platform default (prompt files may contain
            # non-ASCII math symbols).
            with open(prompt_path, 'r', encoding='utf-8') as f:
                _hint = f.read()
        else:
            _hint = ""

        # Per-style human-turn prompt template; {question} is filled in by
        # the prompt template at inference time.
        human_prompt = {
            'zs': "Please provide a clear and step-by-step solution for a scientific problem in the categories of Chemistry, Physics, or Mathematics. The problem will specify the unit of measurement, which should not be included in the answer. Express the final answer as a decimal number with three digits after the decimal point. Conclude the answer by stating 'Therefore, the answer is \\boxed[ANSWER].'\n\nProblem: {question}\nAnswer:",
            'zs-cot': "Please provide a clear and step-by-step solution for a scientific problem in the categories of Chemistry, Physics, or Mathematics. The problem will specify the unit of measurement, which should not be included in the answer. Express the final answer as a decimal number with three digits after the decimal point. Conclude the answer by stating 'Therefore, the answer is \\boxed[ANSWER].'\n\nProblem: {question}\nAnswer:Let’s think step by step.",
            'fs': f"{_hint}\n\nProblem 6: {{question}}\nAnswer: ",
            'fs-cot': f"{_hint}\n\nProblem 6: {{question}}\nExplanation for Problem 6: ",
        }[prompt_type]

        scibench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(role="HUMAN", prompt=human_prompt)
                ])
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512)
        )

        scibench_eval_cfg = dict(
            evaluator=dict(type=AccEvaluator),
            pred_postprocessor=dict(type=scibench_postprocess))

        scibench_datasets.append(
            dict(
                type=ScibenchDataset,
                path="./data/scibench",
                name=_name,
                # Zero-shot keeps the bare abbr so existing result tables and
                # the "" suffix in the summary groups still match.
                abbr=f"scibench-{_name}" if prompt_type == 'zs' else f"scibench-{_name}_{prompt_type}",
                reader_cfg=scibench_reader_cfg,
                infer_cfg=scibench_infer_cfg.copy(),
                eval_cfg=scibench_eval_cfg.copy()
            )
        )
......@@ -10,6 +10,7 @@ with read_base():
from .groups.flores import flores_summary_groups
from .groups.tydiqa import tydiqa_summary_groups
from .groups.xiezhi import xiezhi_summary_groups
from .groups.scibench import scibench_summary_groups
summarizer = dict(
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
......
# Aggregate the ten SciBench subjects into one summary row per prompt style.
# The empty suffix corresponds to the zero-shot ("zs") variant, whose dataset
# abbrs carry no suffix.
scibench_tasks = [
    "atkins", "calculus", "chemmc", "class", "diff",
    "fund", "matter", "quan", "stat", "thermo",
]
scibench_summary_groups = [
    {
        'name': f'scibench{suffix}',
        'subsets': [f'scibench-{task}{suffix}' for task in scibench_tasks],
    }
    for suffix in ["", "_zs-cot", "_fs", "_fs-cot"]
]
......@@ -56,6 +56,7 @@ from .race import * # noqa: F401, F403
from .realtoxicprompts import * # noqa: F401, F403
from .record import * # noqa: F401, F403
from .safety import * # noqa: F401, F403
from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403
from .siqa import * # noqa: F401, F403
from .squad20 import SQuAD20Dataset, SQuAD20Evaluator # noqa: F401, F403
from .storycloze import * # noqa: F401, F403
......
import json
import os.path as osp
import re
from datasets import Dataset
from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
from .base import BaseDataset
@LOAD_DATASET.register_module()
class ScibenchDataset(BaseDataset):
    """Loader for the SciBench benchmark.

    Reads ``{path}/{name}.json`` — a list of entries each providing a
    ``problem_text`` and an ``answer_number`` string — and exposes them as a
    ``datasets.Dataset`` with ``question``/``answer`` columns, matching the
    reader config (``input_columns=['question'], output_column='answer'``).
    """

    @staticmethod
    def load(path: str, name: str):
        """Load one SciBench subject subset.

        Args:
            path: Directory that holds the subset JSON files.
            name: Subset name, e.g. ``atkins``; selects ``{name}.json``.

        Returns:
            datasets.Dataset: rows with ``question`` and ``answer`` fields,
            both whitespace-stripped.
        """
        filename = osp.join(path, f'{name}.json')
        # Fix: explicit encoding — problem texts contain non-ASCII math
        # symbols, so don't rely on the platform default encoding.
        with open(filename, 'r', encoding='utf-8') as infile:
            raw_data = json.load(infile)
        train_data = [{
            'question': entry['problem_text'].strip(),
            'answer': entry['answer_number'].strip(),
        } for entry in raw_data]
        return Dataset.from_list(train_data)
@TEXT_POSTPROCESSORS.register_module('scibench')
def scibench_postprocess(text: str) -> str:
    """Extract the final numeric answer from a model response.

    Priority order: the content of a ``\\boxed{...}`` expression after the
    phrase 'answer is' (if present); otherwise the last number in the text
    with thousands separators removed; otherwise the (possibly trimmed)
    text itself.
    """
    candidate = text
    segments = candidate.split('answer is')
    # More than one segment means the phrase occurred; keep what follows it.
    if len(segments) != 1:
        candidate = segments[1].strip()

    boxed = re.search(r'\\boxed\{(.+?)\}', candidate)
    if boxed is not None:
        return boxed.group(1)

    # Strip commas used as thousands separators (e.g. "1,234" -> "1234"),
    # then take the last number that appears.
    cleaned = re.sub(r'(\d),(\d)', r'\1\2', candidate)
    numbers = re.findall(r'-?\d*\.?\d+|\d+', cleaned)
    return numbers[-1] if numbers else candidate
......@@ -141,7 +141,7 @@ class OpenICLEvalTask(BaseTask):
filename = root + f'_{i}' + ext
i += 1
preds = {k: [pred[k] for pred in preds] for k in preds[0]}
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment