Commit c289ecc0 authored by xinghao

Initial commit

from mmengine.config import read_base
with read_base():
from .svamp_gen_fb25e4 import svamp_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SVAMPDataset, gsm8k_postprocess, Gsm8kEvaluator
svamp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt="Question: There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups How big is each group of bananas?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt='To find the size of each group of bananas, we divide the total number of bananas (290) by the number of groups (2): 290 / 2 = 145. Therefore, each group of bananas contains 145 bananas. The answer is 145.\n'),
dict(role='HUMAN', prompt="Question: Marco and his dad went strawberry picking. Marco's dad's strawberries weighed 11 pounds. If together their strawberries weighed 30 pounds. How much did Marco's strawberries weigh?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="To find Marco's strawberries' weight, we subtract his dad's strawberries' weight (11 pounds) from the total weight of their strawberries (30 pounds): 30 - 11 = 19. Therefore, Marco's strawberries weighed 19 pounds. The answer is 19.\n"),
dict(role='HUMAN', prompt="Question: Edward spent $ 6 to buy 2 books each book costing him the same amount of money. Now he has $ 12. How much did each book cost?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt='To find the cost of each book, we subtract the initial amount of money Edward had ($6) from the current amount of money he has ($12) and divide it by the number of books (2): (12 - 6) / 2 = 6 / 2 = 3 Therefore, each book cost $3. The answer is 3.\n'),
dict(role='HUMAN', prompt="Question: Frank was reading through his favorite book. The book had 3 chapters, each with the same number of pages. It has a total of 594 pages. It took Frank 607 days to finish the book. How many pages are in each chapter?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt='To find the number of pages in each chapter, we divide the total number of pages in the book (594) by the number of chapters (3): 594 / 3 = 198. Therefore, each chapter has 198 pages. The answer is 198.\n'),
dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
svamp_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess))
svamp_datasets = [
dict(
abbr='svamp',
type=SVAMPDataset,
path='./data/svamp/test.jsonl',
reader_cfg=dict(input_columns=['question'], output_column='answer'),
infer_cfg=svamp_infer_cfg,
eval_cfg=svamp_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.datasets import SciEvalDataset
# Only evaluate the biology multiple-choice questions from the test split
_hint = ('Given a question and four options, please select the right answer. '
"Your answer should be 'A', 'B', 'C' or 'D'.")
category = [
'biology',
]
scieval_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='test',
)
scieval_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
),
dict(role='BOT', prompt='{target}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer),
)
scieval_eval_cfg = dict(
evaluator=dict(type=AccwithDetailsEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)
scieval_datasets = [
dict(
abbr='scieval_biology',
type=SciEvalDataset,
path='OpenDFM/SciEval',
name='default',
category=category,
reader_cfg=scieval_reader_cfg,
infer_cfg=scieval_infer_cfg,
eval_cfg=scieval_eval_cfg,
)
]
# SciEval_lifescience_llmjudge_gen.py
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import SciEvalDataset
with read_base():
from .SciEval_lifescience_sets import SciEval_lifescience_subsets
category = [
'biology',
]
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.
{input}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {input}
A) {A}
B) {B}
C) {C}
D) {D}
<Original Question End>
<Gold Target Begin>:
{target}
<Gold Target End>
<Predicted Answer Begin>:
{prediction}
<Predicted End>
Judging the correctness of candidates' answers:
""".strip()
scieval_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='test',
)
scieval_datasets = []
for name in SciEval_lifescience_subsets:
scieval_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
]
)
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer),
)
scieval_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=(
'You are a helpful assistant who evaluates the correctness '
"and quality of models' outputs."
),
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciEvalDataset,
path='OpenDFM/SciEval',
name='default',
reader_cfg=scieval_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
scieval_datasets.append(
dict(
abbr=f'scieval_lifescience_{name}_llmjudge',
type=SciEvalDataset,
path='OpenDFM/SciEval',
name='default',
category=category,
reader_cfg=scieval_reader_cfg,
infer_cfg=scieval_infer_cfg,
eval_cfg=scieval_eval_cfg,
mode='singlescore',
)
)
SciEval_lifescience_subsets = [
'biology',  # college biology
'physics',
'chemistry'
]
from opencompass.datasets import SciKnowEvalDataset, SciKnowEvalEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
ZERO_SHOT_PROMPT = '{q4}'
# Reader configuration
reader_cfg = dict(
input_columns=[
'prompt',
'question',
'choices',
'label',
'answerKey',
'type',
'domain',
'details',
'answer',
'q4'
],
output_column='answerKey',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(type=SciKnowEvalEvaluator),
pred_role='BOT',
)
sciknoweval_dataset_biology = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_biology',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='biology',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_dataset_chemistry = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_chemistry',
path='hicai-zju/SciKnowEval',
subset='chemistry',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_dataset_material = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_material',
path='hicai-zju/SciKnowEval',
subset='material',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_dataset_physics = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_physics',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='physics',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]
from opencompass.datasets import SciKnowEvalDataset
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.evaluator import GenericLLMEvaluator
ZERO_SHOT_PROMPT = '{q4}'
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: Q: {q4}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answerKey}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Reader configuration
reader_cfg = dict(
input_columns=[
'prompt',
'question',
'choices',
'label',
'answerKey',
'type',
'domain',
'details',
'answer',
'q4'
],
output_column='answerKey',
)
# Inference configuration
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Evaluation configuration
eval_cfg_biology = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='biology',
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
eval_cfg_chemistry = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
subset='chemistry',
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
eval_cfg_material = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
subset='material',
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
eval_cfg_physics = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=SciKnowEvalDataset,
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
subset='physics',
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
sciknoweval_dataset_biology = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_biology_llmjudge',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='biology',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_biology,
)
sciknoweval_dataset_chemistry = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_chemistry_llmjudge',
path='hicai-zju/SciKnowEval',
subset='chemistry',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_chemistry,
)
sciknoweval_dataset_material = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_material_llmjudge',
path='hicai-zju/SciKnowEval',
subset='material',
prompt_mode='zero-shot',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_material,
)
sciknoweval_dataset_physics = dict(
type=SciKnowEvalDataset,
abbr='sciknoweval_physics_llmjudge',
path='hicai-zju/SciKnowEval',
prompt_mode='zero-shot',
subset='physics',
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg_physics,
)
sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material]
from mmengine.config import read_base
with read_base():
from .ScienceQA_llmjudge_gen_f00302 import ScienceQA_datasets
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets.ScienceQA import ScienceQADataset
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.
Question:\n
{question}
Options:\n
{choices}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n {choices} \n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
ScienceQA_datasets = []
ScienceQA_reader_cfg = dict(
input_columns=['question', 'choices'],
output_column='label',
)
ScienceQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ScienceQA_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=ScienceQADataset,
path='derek-thomas/ScienceQA',
reader_cfg=ScienceQA_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
)
ScienceQA_datasets.append(
dict(
abbr='ScienceQA',
type=ScienceQADataset,
path='derek-thomas/ScienceQA',
reader_cfg=ScienceQA_reader_cfg,
infer_cfg=ScienceQA_infer_cfg,
eval_cfg=ScienceQA_eval_cfg,
)
)
## 🌾 About SeedBench
**SeedBench** is the first multi-task benchmark designed to evaluate large language models (LLMs) in seed science, focusing on seed breeding. This repository includes the dataset, evaluation code, and documentation to support research in this domain.
SeedBench assesses LLMs across three core seed breeding stages:
- **Gene Information Retrieval**
- **Gene Function and Regulation Analysis**
- **Variety Breeding with Agronomic Trait Optimization**
Built with domain experts, SeedBench features **2,264 expert-validated questions** across 11 task types and 10 subcategories, initially targeting rice breeding. Future updates will include other crops like maize, soybean, and wheat.
---
## 🔎 Dataset Details
- **Corpus**: 308,727 publications cleaned to 1.1 billion tokens; 279 segments from 113 documents.
- **Questions**: 4,336 across 11 task types, bilingual (English/Chinese), expert-validated.
- **Focus**: Rice breeding as a representative case.
**Types and metrics:**
<div align="center">
| Type ID | Question Type | Metric | Count |
|---------|----------------------------|----------|-------|
| **Q&A** | | | |
| QA-1 | Multiple Choice | Accuracy | 354 |
| QA-2 | Multiple Answer | Macro-F1 | 291 |
| QA-3 | Fill-in-the-Blank | ROUGE-L | 426 |
| QA-4 | Generation | ROUGE-L | 403 |
| **Summarization** | | | |
| SUM-1 | Simple Summarization | ROUGE-L | 638 |
| SUM-2 | Key Information Extraction | ROUGE-L | 638 |
| **Reading Comprehension** | | | |
| RC-1 | Multiple Choice | Accuracy | 268 |
| RC-2 | Multiple Answer | Macro-F1 | 212 |
| RC-3 | Fill-in-the-Blank | ROUGE-L | 424 |
| RC-4 | Generation | ROUGE-L | 403 |
| RC-5 | Subcategory Classification | Accuracy | 279 |
</div>
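
Several of the generation-style tasks above are scored with ROUGE-L. As an illustrative sketch only (the ROUGE evaluators used in the OpenCompass configs may rely on a different backend and tokenization), the metric can be computed with the `rouge-score` package:

```python
# Illustrative ROUGE-L computation; not necessarily the exact backend
# used by the OpenCompass Rouge evaluators.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Hypothetical reference/prediction pair for a generation-style item.
reference = 'OsSPL14 promotes panicle branching and increases grain yield in rice.'
prediction = 'OsSPL14 increases panicle branching and grain yield in rice.'

scores = scorer.score(reference, prediction)  # dict of Score(precision, recall, fmeasure)
print(round(scores['rougeL'].fmeasure, 3))
```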
---
## 📂 Dataset Links
- [SeedBench on Github](https://github.com/open-sciencelab/SeedBench)
- [SeedBench on Hugging Face](https://huggingface.co/datasets/yj12869741/SeedBench)
- [SeedBench on ModelScope](https://www.modelscope.cn/datasets/y12869741/SeedBench/summary)
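
For a quick look at the data outside OpenCompass, the Hugging Face copy can be loaded with the `datasets` library. This is a sketch only; the configuration and split names on the Hub may differ and may need adjusting:

```python
# Sketch: the repo id comes from the link above; split layout is an assumption.
from datasets import load_dataset

ds = load_dataset('yj12869741/SeedBench')
print(ds)                          # inspect the available splits
first_split = list(ds.keys())[0]
print(ds[first_split][0])          # peek at the first record
```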
---
## ☀️ Key Results
We evaluated 26 LLMs, including proprietary, open-source, and domain-specific models. Highlights:
### Performance by Question Type
- **Top Performers**: DeepSeek-V3 (68.37), GPT-4 (67.88).
### Performance by Task Types
| Model | QA-1 | QA-2 | QA-3 | QA-4 | SUM-1 | SUM-2 | RC-1 | RC-2 | RC-3 | RC-4 | RC-5 | Avg |
|------------------|------|------|------|------|-------|-------|------|------|------|------|------|------|
| GPT-4 | 60.50| 73.87| 21.35| 36.07| 58.73 | 62.89 | 100.00| 96.44| 87.86| 62.29| 86.74| 67.88|
| DeepSeek-V3 | 72.50| 79.84| 29.29| 40.63| 48.06 | 54.67 | 100.00| 97.22| 87.89| 55.19| 86.74| 68.37|
| Qwen2-72B | 59.50| 75.98| 19.55| 31.62| 31.08 | 63.09 | 99.12 | 94.24| 72.20| 51.58| 89.96| 62.54|
### Performance by Subcategory
| Model | C1 | C2 | C3 | C4 | C5 | C6 | C7 | C8 | C9 | C10 | Avg |
|-------------------|------|------|------|------|------|------|------|------|------|------|------|
| GPT-4 | 59.59| 60.55| 76.32| 61.16| 56.34| 59.35| 63.67| 64.74| 60.65| 67.66| 62.06|
| DeepSeek-V3-671B | 56.03| 62.42| 74.81| 63.17| 55.23| 58.84| 68.23| 69.04| 66.46| 68.48| 63.30|
| Qwen2-72B | 51.16| 58.10| 74.07| 59.72| 51.58| 57.76| 58.85| 61.63| 56.69| 59.11| 57.62|
- **Top Performers**: DeepSeek-V3-671B (63.30), GPT-4 (62.06).
from mmengine.config import read_base
with read_base():
# By default, use an LLM as the judge
from .seedbench_gen_5d5ea1 import seedbench_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator, JiebaRougeEvaluator, RougeEvaluator
from opencompass.datasets import SeedBenchDataset, F1ScoreEvaluator, my_multiple_select_postprocess, AverageRougeScoreEvaluator
from opencompass.utils.text_postprocessors import first_option_postprocess
agri_reader_cfg = dict(
input_columns=['instruction', 'question'],
output_column='answer'
)
agri_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template='{instruction}\n{question}\n'
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
default_dataset_cfg = {
'type': SeedBenchDataset,
'path': 'json',
'reader_cfg': agri_reader_cfg,
'infer_cfg': agri_infer_cfg,
}
dataset_configs = [
# 1-n
{'abbr': 'seedbench_1-1', 'data_file': '1-1.json', 'evaluator': 'AccEvaluator',
'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')},
{'abbr': 'seedbench_1-2', 'data_file': '1-2.json', 'evaluator': 'F1ScoreEvaluator',
'pred_postprocessor': dict(type=my_multiple_select_postprocess)},
# {'abbr': 'seedbench_1-3_em', 'data_file': '1-3.json', 'evaluator': 'ExactMatchScoreEvaluator'},
{'abbr': 'seedbench_1-3', 'data_file': '1-3.json', 'evaluator': 'AverageRougeScoreEvaluator'},
{'abbr': 'seedbench_1-4', 'data_file': '1-4.json', 'evaluator': 'RougeEvaluator'},
# # 2-n
{'abbr': 'seedbench_2-1', 'data_file': '2-1.json', 'evaluator': 'RougeEvaluator'},
{'abbr': 'seedbench_2-2', 'data_file': '2-2.json', 'evaluator': 'RougeEvaluator'},
# 3-n
{'abbr': 'seedbench_3-1', 'data_file': '3-1.json', 'evaluator': 'AccEvaluator',
'pred_postprocessor': dict(type=first_option_postprocess, options='ABCD')},
{'abbr': 'seedbench_3-2', 'data_file': '3-2.json', 'evaluator': 'F1ScoreEvaluator',
'pred_postprocessor': dict(type=my_multiple_select_postprocess)},
# {'abbr': 'seedbench_3-3_em', 'data_file': '3-3.json', 'evaluator': 'ExactMatchScoreEvaluator'},
{'abbr': 'seedbench_3-3', 'data_file': '3-3.json', 'evaluator': 'AverageRougeScoreEvaluator'},
{'abbr': 'seedbench_3-4', 'data_file': '3-4.json', 'evaluator': 'RougeEvaluator'},
{'abbr': 'seedbench_3-5', 'data_file': '3-5.json', 'evaluator': 'AccScoreStr_Evaluator'},
]
seedbench_datasets = []
for stage in ['zero-shot', 'one-shot']:
for config in dataset_configs:
eval_cfg = dict(
evaluator=dict(type=config['evaluator'])
)
if 'pred_postprocessor' in config:
eval_cfg['pred_postprocessor'] = config['pred_postprocessor']
data_file = f"{stage}/{config['data_file']}"
abbr_name = f"{config['abbr']}_{stage}"
seedbench_datasets.append(
dict(
type=SeedBenchDataset,
abbr=abbr_name,
data_files=data_file,
path='opencompass/seedbench',
reader_cfg=agri_reader_cfg,
infer_cfg=agri_infer_cfg,
eval_cfg=eval_cfg
)
)
# OpenCompass SimpleQA dataset config for evaluation
## 1. Introduction
SimpleQA is a benchmark from OpenAI that evaluates the ability of language models to answer short, fact-seeking questions.
The original repository is https://github.com/openai/simple-evals.
## 2. How to use
Please refer to the demo evaluation script `/opencompass/configs/mine/simpleqa_eval.py`.
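If that script is not present in your checkout, a minimal entry config along the following lines should work. This is a sketch: both import paths are examples (the model path in particular is a placeholder), and because `simpleqa_eval_cfg` uses `LMEvaluator`, a judge model also has to be configured in your OpenCompass setup.

```python
# Minimal sketch of an eval entry config. Import paths are examples;
# substitute the dataset/model configs that exist in your installation.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen import simpleqa_datasets
    # Hypothetical model config path; replace with the model you want to evaluate.
    from opencompass.configs.models.qwen.hf_qwen2_5_7b_instruct import models

datasets = simpleqa_datasets
```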
from mmengine.config import read_base
with read_base():
from .simpleqa_gen_0283c3 import simpleqa_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import SimpleQADataset, simpleqa_postprocess
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.
The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 5: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 6: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 7: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.
Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
- Predicted answers "120k", "124k", and "115k" are all CORRECT.
- Predicted answers "100k" and "113k" are INCORRECT.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {problem}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()
simpleqa_reader_cfg = dict(input_columns=['problem'], output_column='answer')
simpleqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step:"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048))
simpleqa_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=simpleqa_postprocess),
),
pred_role='BOT',
)
simpleqa_datasets = [
dict(
abbr='simpleqa',
type=SimpleQADataset,
path='opencompass/simpleqa',
reader_cfg=simpleqa_reader_cfg,
infer_cfg=simpleqa_infer_cfg,
eval_cfg=simpleqa_eval_cfg,
mode='singlescore',
)
]
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_0shot_instruct import nc_0shot_instruct_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_0_shot_instruct import pp_acc_datasets_0shot_instruct
from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_0shot_instruct import pp_rmse_0shot_instruct_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_0shot_instruct import fts_0shot_instruct_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_0shot_instruct import meteor_0shot_instruct_datasets
smolinstruct_datasets_0shot_instruct = nc_0shot_instruct_datasets + pp_rmse_0shot_instruct_datasets + pp_acc_datasets_0shot_instruct + meteor_0shot_instruct_datasets + fts_0shot_instruct_datasets
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import FTSEvaluator
from opencompass.datasets import SmolInstructDataset
fts_0shot_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
fts_hint_dict = {
'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain the SMILES representation of the predicted product wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and all reactants and reagents should be enclosed **together** within a single pair of <SMILES> and </SMILES> tags, separated by ".". Your reply must be valid and chemically reasonable.""",
}
name_dict = {
'MG': 'molecule_generation',
'FS': 'forward_synthesis',
'RS': 'retrosynthesis'
}
fts_0shot_instruct_datasets = []
for _name in name_dict:
_hint = fts_hint_dict[_name]
fts_0shot_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '),
dict(role='BOT', prompt='{output}\n')
]),
# template=f'<s>[INST] {{input}} [/INST]',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
fts_0shot_eval_cfg = dict(
evaluator=dict(type=FTSEvaluator),
)
fts_0shot_instruct_datasets.append(
dict(
abbr=f'{_name}-0shot-instruct',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=fts_0shot_reader_cfg,
infer_cfg=fts_0shot_infer_cfg,
eval_cfg=fts_0shot_eval_cfg,
))
del _name
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import FTSEvaluator
from opencompass.datasets import SmolInstructDataset
fts_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
fts_hint_dict = {
'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain only the SMILES representation of the predicted product and no other text. Your reply must be valid and chemically reasonable.""",
'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and different reactants and reagents should be separated by ".". Your reply must be valid and chemically reasonable.""",
}
name_dict = {
'MG': 'molecule_generation',
'FS': 'forward_synthesis',
'RS': 'retrosynthesis'
}
fts_datasets = []
for _name in fts_hint_dict:
_hint = fts_hint_dict[_name]
fts_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
dict(role='BOT', prompt='{output}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
inferencer=dict(type=GenInferencer),
)
fts_eval_cfg = dict(
evaluator=dict(type=FTSEvaluator),
)
fts_datasets.append(
dict(
abbr=f'{_name}',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=fts_reader_cfg,
infer_cfg=fts_infer_cfg,
eval_cfg=fts_eval_cfg,
))
del _name, _hint
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_gen_c84c18 import nc_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_gen_8607a3 import pp_acc_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_gen_0fcc6b import pp_rmse_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_gen_5774b5 import fts_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_gen_065150 import meteor_datasets
smolinstruct_datasets = nc_datasets + pp_rmse_datasets + pp_acc_datasets + meteor_datasets + fts_datasets
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import MeteorEvaluator
from opencompass.datasets import SmolInstructDataset
meteor_0shot_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
meteor_hint_dict = {
'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
}
name_dict = {
'MC': 'molecule_captioning',
}
meteor_0shot_instruct_datasets = []
for _name in name_dict:
_hint = meteor_hint_dict[_name]
meteor_0shot_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '),
dict(role='BOT', prompt='{output}\n')
]),
# template=f'<s>[INST] {{input}} [/INST]',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
meteor_0shot_eval_cfg = dict(
evaluator=dict(type=MeteorEvaluator),
)
meteor_0shot_instruct_datasets.append(
dict(
abbr=f'{_name}-0shot-instruct',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=meteor_0shot_reader_cfg,
infer_cfg=meteor_0shot_infer_cfg,
eval_cfg=meteor_0shot_eval_cfg,
))
del _name