Unverified Commit 34561ece authored by philipwangOvO's avatar philipwangOvO Committed by GitHub
Browse files

[Feature] Add InfiniteBench (#739)



* add InfiniteBench

* add InfiniteBench

---------
Co-authored-by: wangchonghua <wangchonghua@pjlab.org.cn>
parent 3a68083e
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import InfiniteBenchretrievenumberDataset
from opencompass.datasets.infinitebench.utils import InfiniteBench_first_number_postprocess
# Reader: the prompt consumes the 'context' and 'input' columns; 'answer' is the label.
InfiniteBench_retrievenumber_reader_cfg = {
    'input_columns': ['context', 'input'],
    'output_column': 'answer',
}

# Zero-shot generation: system preamble plus a single human turn; the expected
# output is a short number string, hence the small max_out_len.
InfiniteBench_retrievenumber_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'begin': [{
                'role': 'SYSTEM',
                'fallback_role': 'HUMAN',
                'prompt': 'You are a helpful assistant.',
            }],
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': 'There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n\n{input}',
                },
                {'role': 'BOT', 'prompt': ''},
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 12},
}

# Accuracy over the first number extracted from the model output.
InfiniteBench_retrievenumber_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': InfiniteBench_first_number_postprocess},
    'pred_role': 'BOT',
}

InfiniteBench_retrievenumber_datasets = [{
    'type': InfiniteBenchretrievenumberDataset,
    'abbr': 'InfiniteBench_retrievenumber',
    'path': './data/InfiniteBench/number_string.jsonl',
    'reader_cfg': InfiniteBench_retrievenumber_reader_cfg,
    'infer_cfg': InfiniteBench_retrievenumber_infer_cfg,
    'eval_cfg': InfiniteBench_retrievenumber_eval_cfg,
}]
from mmengine.config import read_base
# Stable entry point: re-export the pinned (hash-suffixed) generation config so
# users can import this module without tracking prompt-version suffixes.
with read_base():
    from .infinitebench_retrievepasskey_gen_62ff68 import InfiniteBench_retrievepasskey_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import InfiniteBenchretrievepasskeyDataset
from opencompass.datasets.infinitebench.utils import InfiniteBench_first_number_postprocess
# Reader: the prompt consumes the 'context' and 'input' columns; 'answer' is the label.
InfiniteBench_retrievepasskey_reader_cfg = {
    'input_columns': ['context', 'input'],
    'output_column': 'answer',
}

# Zero-shot generation: the passkey is a short digit string, hence max_out_len=6.
InfiniteBench_retrievepasskey_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'begin': [{
                'role': 'SYSTEM',
                'fallback_role': 'HUMAN',
                'prompt': 'You are a helpful assistant.',
            }],
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': 'There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n\n{input}',
                },
                {'role': 'BOT', 'prompt': ''},
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 6},
}

# Accuracy over the first number extracted from the model output.
InfiniteBench_retrievepasskey_eval_cfg = {
    'evaluator': {'type': AccEvaluator},
    'pred_postprocessor': {'type': InfiniteBench_first_number_postprocess},
    'pred_role': 'BOT',
}

InfiniteBench_retrievepasskey_datasets = [{
    'type': InfiniteBenchretrievepasskeyDataset,
    'abbr': 'InfiniteBench_retrievepasskey',
    'path': './data/InfiniteBench/passkey.jsonl',
    'reader_cfg': InfiniteBench_retrievepasskey_reader_cfg,
    'infer_cfg': InfiniteBench_retrievepasskey_infer_cfg,
    'eval_cfg': InfiniteBench_retrievepasskey_eval_cfg,
}]
from mmengine.config import read_base
# Stable entry point: re-export the pinned (hash-suffixed) generation config so
# users can import this module without tracking prompt-version suffixes.
with read_base():
    from .infinitebench_zhqa_gen_1e5293 import InfiniteBench_zhqa_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import InfiniteBenchzhqaDataset, LongBenchF1Evaluator
from opencompass.utils.text_postprocessors import general_cn_postprocess
# Reader: the prompt consumes 'context' and 'question'; 'answer' is the label.
InfiniteBench_zhqa_reader_cfg = {
    'input_columns': ['context', 'question'],
    'output_column': 'answer',
}

# Zero-shot generation; the prompt asks for a brief answer, so cap output at 40 tokens.
InfiniteBench_zhqa_infer_cfg = {
    'prompt_template': {
        'type': PromptTemplate,
        'template': {
            'begin': [{
                'role': 'SYSTEM',
                'fallback_role': 'HUMAN',
                'prompt': 'You are a helpful assistant.',
            }],
            'round': [
                {
                    'role': 'HUMAN',
                    'prompt': '请根据以下书籍回答我的问题。\n\n{context}\n\n问题:{question}\n请尽量简短地回答。',
                },
                {'role': 'BOT', 'prompt': ''},
            ],
        },
    },
    'retriever': {'type': ZeroRetriever},
    'inferencer': {'type': GenInferencer, 'max_out_len': 40},
}

# Chinese QA is scored with token-level F1 (language='zh' selects CN tokenisation).
InfiniteBench_zhqa_eval_cfg = {
    'evaluator': {'type': LongBenchF1Evaluator, 'language': 'zh'},
    'pred_role': 'BOT',
}

InfiniteBench_zhqa_datasets = [{
    'type': InfiniteBenchzhqaDataset,
    'abbr': 'InfiniteBench_zhqa',
    'path': './data/InfiniteBench/longbook_qa_chn.jsonl',
    'reader_cfg': InfiniteBench_zhqa_reader_cfg,
    'infer_cfg': InfiniteBench_zhqa_infer_cfg,
    'eval_cfg': InfiniteBench_zhqa_eval_cfg,
}]
# Every InfiniteBench sub-task, rolled into a single summary group whose subset
# names match the dataset abbrs ('InfiniteBench_<task>').
_infinitebench_tasks = [
    'codedebug', 'coderun', 'endia', 'enmc', 'enqa', 'ensum',
    'mathcalc', 'mathfind', 'retrievekv', 'retrievenumber',
    'retrievepasskey', 'zhqa',
]
infinitebench_summary_groups = [{
    'name': 'InfiniteBench',
    'subsets': [f'InfiniteBench_{task}' for task in _infinitebench_tasks],
}]
from mmengine.config import read_base
with read_base():
from .groups.infinitebench import infinitebench_summary_groups
# Gather every '*_summary_groups' list that the read_base imports above placed
# into this module's namespace and concatenate them into one flat list.
# NOTE: locals() must stay in the comprehension's iterable position so it is
# evaluated in module scope, not inside the comprehension's own scope.
summarizer = dict(
    summary_groups=sum(
        [groups for name, groups in locals().items()
         if name.endswith('_summary_groups')],
        [],
    ),
)
...@@ -45,6 +45,7 @@
from .hellaswag import *  # noqa: F401, F403
from .huggingface import *  # noqa: F401, F403
from .humaneval import *  # noqa: F401, F403
from .humanevalx import *  # noqa: F401, F403
from .infinitebench import *  # noqa: F401, F403
from .iwslt2017 import *  # noqa: F401, F403
from .jigsawmultilingual import *  # noqa: F401, F403
from .kaoshi import KaoshiDataset, KaoshiEvaluator  # noqa: F401, F403
......
from .infinitebench_codedebug import * # noqa: F401, F403
from .infinitebench_coderun import * # noqa: F401, F403
from .infinitebench_endia import * # noqa: F401, F403
from .infinitebench_enmc import * # noqa: F401, F403
from .infinitebench_enqa import * # noqa: F401, F403
from .infinitebench_ensum import * # noqa: F401, F403
from .infinitebench_mathcalc import * # noqa: F401, F403
from .infinitebench_mathfind import * # noqa: F401, F403
from .infinitebench_retrievekv import * # noqa: F401, F403
from .infinitebench_retrievenumber import * # noqa: F401, F403
from .infinitebench_retrievepasskey import * # noqa: F401, F403
from .infinitebench_zhqa import * # noqa: F401, F403
from .utils import * # noqa: F401, F403
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchcodedebugDataset(BaseDataset):
    """InfiniteBench code-debug: 4-way multiple choice over a long code context."""

    @staticmethod
    def load(path: str):
        """Read a JSONL file and return a Dataset with context/question,
        the four options as separate columns, and the gold letter."""
        rows = []
        for item in iter_jsonl(path):
            options = item['options']
            # The gold answer is stored as option text; map it to its letter.
            letter = chr(ord('A') + options.index(item['answer'][0]))
            rows.append({
                'context': item['context'],
                'question': item['input'],
                'option_A': options[0],
                'option_B': options[1],
                'option_C': options[2],
                'option_D': options[3],
                'answer': letter,
            })
        return Dataset.from_list(rows)
import re
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchcoderunDataset(BaseDataset):
    """InfiniteBench code-run: evaluate a specific function call found in the prompt."""

    @staticmethod
    def load(path: str):
        """Read a JSONL file and return a Dataset with the context, the target
        function name, the full call expression, and the answer."""
        rows = []
        for item in iter_jsonl(path):
            # The input embeds the call to evaluate, e.g. "func_3(-14)";
            # take the first match, as the original pipeline did.
            func_call = re.findall(r'func_[0-9]+\(\-?[0-9]+\)', item['input'])[0]
            rows.append({
                'context': item['context'],
                'func': func_call.split('(')[0],
                'func_call': func_call,
                'answer': item['answer'],
            })
        return Dataset.from_list(rows)
from typing import List
from datasets import Dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchendiaDataset(BaseDataset):
    """InfiniteBench en-dia: questions over a long English dialogue transcript."""

    @staticmethod
    def load(path: str):
        """Read a JSONL file and return a Dataset of context/question/answer rows."""
        rows = [{
            'context': item['context'],
            'question': item['input'],
            'answer': item['answer'],
        } for item in iter_jsonl(path)]
        return Dataset.from_list(rows)
@ICL_EVALUATORS.register_module()
class InfiniteBenchendiaEvaluator(BaseEvaluator):
    """Hit-rate scorer: an example counts as correct when the first reference
    answer appears among the prediction's tokens after punctuation is replaced
    by spaces and tokens are upper-cased.

    NOTE(review): the reference itself is compared as-is, so this assumes the
    gold answers are already upper-case — kept from the original implementation.
    """

    def score(self, predictions: List, references: List) -> dict:
        """Return {'score': percentage of hits}.

        Args:
            predictions: model outputs, one string per example.
            references: per-example list of gold answers; only the first
                entry is checked.
        """
        # Fix: the original divided by len(predictions) unconditionally and
        # raised ZeroDivisionError on an empty prediction list.
        if not predictions:
            return {'score': 0.0}
        # Single-pass replacement of the same punctuation set the original
        # stripped with chained str.replace calls.
        table = str.maketrans({c: ' ' for c in '\n:"\'.,?!{}'})
        hits = 0
        for prediction, reference in zip(predictions, references):
            words = [w.upper() for w in prediction.translate(table).split()]
            if reference[0] in words:
                hits += 1
        return {'score': hits / len(predictions) * 100}
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchenmcDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = list(iter_jsonl(path))
raw_data = []
for item in dataset:
context = item['context']
question = item['input']
option_A = item['options'][0]
option_B = item['options'][1]
option_C = item['options'][2]
option_D = item['options'][3]
answer = chr(item['options'].index(item['answer'][0]) + ord('A'))
raw_data.append({
'context': context,
'question': question,
'option_A': option_A,
'option_B': option_B,
'option_C': option_C,
'option_D': option_D,
'answer': answer
})
dataset = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchenqaDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = list(iter_jsonl(path))
raw_data = []
for item in dataset:
context = item['context']
question = item['input']
answer = item['answer']
raw_data.append({
'context': context,
'question': question,
'answer': answer
})
dataset = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchensumDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = list(iter_jsonl(path))
raw_data = []
for item in dataset:
context = item['context']
answer = item['answer']
raw_data.append({'context': context, 'answer': answer})
dataset = Dataset.from_list(raw_data)
return dataset
import re
from typing import List
from datasets import Dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchmathcalcDataset(BaseDataset):
    """InfiniteBench math-calc: evaluate a long arithmetic expression chain."""

    @staticmethod
    def load(path: str):
        """Read a JSONL file and return a Dataset of context/answer rows."""
        rows = [{'context': item['context'], 'answer': item['answer']}
                for item in iter_jsonl(path)]
        return Dataset.from_list(rows)
@ICL_EVALUATORS.register_module()
class InfiniteBenchmathcalcEvaluator(BaseEvaluator):
    """Prefix-match scorer for math-calc: for each example, count how many
    leading reference numbers the prediction reproduces in order, stopping at
    the first mismatch; the total is averaged over the number of predictions
    and scaled by 100 (so the score is the mean matched-prefix length x 100,
    matching the original implementation)."""

    def score(self, predictions: List, references: List) -> dict:
        """Return {'score': ...} as described in the class docstring.

        Args:
            predictions: model outputs, one string per example.
            references: per-example sequence of expected integers.
        """
        # Fix: the original divided by len(predictions) unconditionally and
        # raised ZeroDivisionError on an empty prediction list.
        if not predictions:
            return {'score': 0.0}
        matched = 0.
        for prediction, reference in zip(predictions, references):
            # Maximal ASCII-digit runs in order — equivalent to the original
            # re.split('[^0-9]', ...) followed by dropping empty pieces.
            predicted_nums = [int(tok) for tok in re.findall(r'[0-9]+', prediction)]
            # Count the matching prefix; stop at the first mismatch or when
            # the prediction runs out of numbers.
            for expected, got in zip(reference, predicted_nums):
                if expected != got:
                    break
                matched += 1
        return {'score': matched / len(predictions) * 100}
import re
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchmathfindDataset(BaseDataset):
    """InfiniteBench math-find: locate a target statistic in a long number list."""

    @staticmethod
    def load(path: str):
        """Read a JSONL file and return a Dataset of prefix/context/answer rows,
        where 'prefix' restates the question extracted from the raw input."""
        rows = []
        for item in iter_jsonl(path):
            # Pull e.g. "The largest number of" from the raw prompt, lower-case
            # it, and drop the trailing " of" to get "the largest number".
            target = re.findall(r'The .+ of', item['input'])[0].lower()[:-3]
            rows.append({
                'prefix': f'What is {target} in the following list?',
                'context': item['context'],
                'answer': item['answer'],
            })
        return Dataset.from_list(rows)
from typing import List
from datasets import Dataset
from opencompass.openicl import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchretrievekvDataset(BaseDataset):
    """InfiniteBench retrieve.kv: retrieve the value for a key from a long
    key-value context."""

    @staticmethod
    def load(path: str):
        """Read a JSONL file at ``path`` and return a Dataset with 'context',
        'input' (the retrieval query) and 'answer' columns."""
        raw_data = []
        for item in iter_jsonl(path):
            # Fix: local was named `input`, shadowing the builtin; the emitted
            # column keeps the name 'input' expected by the configs.
            query = item['input']
            raw_data.append({
                'context': item['context'],
                'input': query,
                'answer': item['answer'],
            })
        return Dataset.from_list(raw_data)
@ICL_EVALUATORS.register_module()
class InfiniteBenchretrievekvEvaluator(BaseEvaluator):
    """Hit-rate scorer: an example counts as correct when the reference value
    appears verbatim among the prediction's whitespace-separated tokens after
    punctuation is replaced by spaces (case-sensitive, as in the original)."""

    def score(self, predictions: List, references: List) -> dict:
        """Return {'score': percentage of hits}.

        Args:
            predictions: model outputs, one string per example.
            references: gold value per example.
        """
        # Fix: the original divided by len(predictions) unconditionally and
        # raised ZeroDivisionError on an empty prediction list.
        if not predictions:
            return {'score': 0.0}
        # Single-pass replacement of the same punctuation set the original
        # stripped with chained str.replace calls.
        table = str.maketrans({c: ' ' for c in '\n:"\'.,?!{}'})
        hits = 0
        for prediction, reference in zip(predictions, references):
            if reference in prediction.translate(table).split():
                hits += 1
        return {'score': hits / len(predictions) * 100}
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchretrievenumberDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = list(iter_jsonl(path))
raw_data = []
for item in dataset:
context = item['context']
input = item['input']
answer = item['answer']
raw_data.append({
'context': context,
'input': input,
'answer': answer
})
dataset = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
from .utils import iter_jsonl
@LOAD_DATASET.register_module()
class InfiniteBenchretrievepasskeyDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = list(iter_jsonl(path))
raw_data = []
for item in dataset:
context = item['context']
input = item['input']
answer = item['answer']
raw_data.append({
'context': context,
'input': input,
'answer': answer
})
dataset = Dataset.from_list(raw_data)
return dataset
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment