Unverified Commit 655a807f authored by philipwangOvO, committed by GitHub

[Dataset] LongBench (#236)


Co-authored-by: wangchonghua <wangchonghua@pjlab.org.cn>
parent c6a34949
summarizer = dict(
dataset_abbrs = [
'--------- LongBench Single-Document QA ---------', # category
"LongBench_narrativeqa",
'LongBench_qasper',
'LongBench_multifieldqa_en',
"LongBench_multifieldqa_zh",
'--------- LongBench Multi-Document QA ---------', # category
'LongBench_hotpotqa',
'LongBench_2wikimqa',
'LongBench_musique',
'LongBench_dureader',
'--------- LongBench Summarization ---------', # category
'LongBench_gov_report',
'LongBench_qmsum',
'LongBench_vcsum',
'--------- LongBench Few-shot Learning ---------', # category
'LongBench_trec',
'LongBench_nq',
'LongBench_triviaqa',
'LongBench_lsht',
'--------- LongBench Code Completion ---------', # category
'LongBench_lcc',
'LongBench_repobench-p',
'--------- LongBench Synthetic Tasks ---------', # category
'LongBench_passage_retrieval_en',
'LongBench_passage_count',
'LongBench_passage_retrieval_zh',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
prompt_db=dict(
database_path='configs/datasets/log.json',
config_dir='configs/datasets',
blacklist='.promptignore'),
)
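The `summary_groups` entry collects every local variable whose name ends in `_summary_groups`, so any group definition imported above the summarizer dict is picked up automatically. A minimal sketch of such a variable, assuming the usual OpenCompass format of `name`/`subsets` dicts (the group name and subset list below are illustrative, not part of this commit):

# Hypothetical group definition that the locals() scan above would collect.
longbench_qa_summary_groups = [
    dict(name='LongBench_single_doc_qa',
         subsets=['LongBench_narrativeqa', 'LongBench_qasper',
                  'LongBench_multifieldqa_en', 'LongBench_multifieldqa_zh']),
]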
@@ -54,6 +54,7 @@ from .LEval_scientific_qa import * # noqa: F401, F403
from .LEval_topic_retrieval import * # noqa: F401, F403
from .LEval_tpo import * # noqa: F401, F403
from .LEval_tvshow_summ import * # noqa: F401, F403
from .longbench import * # noqa: F401, F403
from .math import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
...
from .evaluators import LongBenchClassificationEvaluator # noqa: F401, F403
from .evaluators import LongBenchCodeSimEvaluator # noqa: F401, F403
from .evaluators import LongBenchCountEvaluator # noqa: F401, F403
from .evaluators import LongBenchF1Evaluator # noqa: F401, F403
from .evaluators import LongBenchRetrievalEvaluator # noqa: F401, F403
from .evaluators import LongBenchRougeEvaluator # noqa: F401, F403
from .longbench_2wikim_qa import * # noqa: F401, F403
from .longbench_dureader import * # noqa: F401, F403
from .longbench_gov_report import * # noqa: F401, F403
from .longbench_hotpot_qa import * # noqa: F401, F403
from .longbench_lcc import * # noqa: F401, F403
from .longbench_lsht import * # noqa: F401, F403
from .longbench_multifieldqa_en import * # noqa: F401, F403
from .longbench_multifieldqa_zh import * # noqa: F401, F403
from .longbench_musique import * # noqa: F401, F403
from .longbench_narrative_qa import * # noqa: F401, F403
from .longbench_nq import * # noqa: F401, F403
from .longbench_passage_count import * # noqa: F401, F403
from .longbench_passage_retrieval_en import * # noqa: F401, F403
from .longbench_passage_retrieval_zh import * # noqa: F401, F403
from .longbench_qasper import * # noqa: F401, F403
from .longbench_qmsum import * # noqa: F401, F403
from .longbench_repobench import * # noqa: F401, F403
from .longbench_trec import * # noqa: F401, F403
from .longbench_trivia_qa import * # noqa: F401, F403
from .longbench_vcsum import * # noqa: F401, F403
import difflib
import re
import string
from collections import Counter
from typing import List
import jieba
from fuzzywuzzy import fuzz
from rouge import Rouge
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def normalize_zh_answer(s):
"""Lower text and remove punctuation, extra whitespace."""
def white_space_fix(text):
return ''.join(text.split())
def remove_punc(text):
        cn_punctuation = ('！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀'
                          '｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟'
                          '〰〾〿–—‘’‛“”„‟…‧﹏.')
all_punctuation = set(string.punctuation + cn_punctuation)
return ''.join(ch for ch in text if ch not in all_punctuation)
def lower(text):
return text.lower()
return white_space_fix(remove_punc(lower(s)))
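# Hand-worked examples of the two normalizers above (toy strings, written out
# for illustration only):
#   normalize_answer('The Quick, Brown Fox!')  ->  'quick brown fox'
#   normalize_zh_answer('你好，世界！')  ->  '你好世界'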
@ICL_EVALUATORS.register_module()
class LongBenchF1Evaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
def f1_score(prediction, reference, **kwargs):
common = Counter(prediction) & Counter(reference)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction)
recall = 1.0 * num_same / len(reference)
f1 = (2 * precision * recall) / (precision + recall)
return f1
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.
for reference in reference_list:
if self.language == 'en':
normalized_prediction = normalize_answer(prediction)
normalized_reference = normalize_answer(reference)
prediction_tokens = normalized_prediction.split()
reference_tokens = normalized_reference.split()
else:
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
prediction_tokens = [
token for token in prediction_tokens if len(token) > 0
]
reference_tokens = [
token for token in reference_tokens if len(token) > 0
]
task_score = max(task_score,
f1_score(prediction_tokens, reference_tokens))
score += task_score
score = score / len(predictions) * 100
return {'score': score}
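# Usage sketch for the F1 evaluator above (toy strings, not benchmark data);
# each reference entry is a list of acceptable answers for one prediction:
#
#     evaluator = LongBenchF1Evaluator(language='en')
#     evaluator.score(predictions=['Paris is the capital of France'],
#                     references=[['Paris']])
#
# After normalization the prediction has five tokens and shares one with the
# reference, so precision is 1/5, recall is 1, F1 is 1/3, and the returned
# score is roughly 33.3.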
@ICL_EVALUATORS.register_module()
class LongBenchCountEvaluator(BaseEvaluator):
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
for reference in reference_list:
numbers = re.findall(r'\d+', prediction)
right_num = 0
for number in numbers:
if str(number) == str(reference):
right_num += 1
score += 0.0 if len(numbers) == 0 else float(right_num /
len(numbers))
score = score / len(predictions) * 100
return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchRetrievalEvaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
for reference in reference_list:
if self.language == 'en':
pattern = r'Paragraph (\d+)'
else:
pattern = r'段落(\d+)'
matches = re.findall(pattern, reference)
reference_id = matches[0]
numbers = re.findall(r'\d+', prediction)
right_num = 0
for number in numbers:
if str(number) == str(reference_id):
right_num += 1
score += 0.0 if len(numbers) == 0 else float(right_num /
len(numbers))
score = score / len(predictions) * 100
return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchRougeEvaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.
for reference in reference_list:
if self.language == 'zh':
prediction = ' '.join(
list(jieba.cut(prediction, cut_all=False)))
reference = ' '.join(
list(jieba.cut(reference, cut_all=False)))
rouge = Rouge()
if prediction != '':
cur_score = rouge.get_scores([prediction], [reference],
avg=True)['rouge-l']['f']
else:
cur_score = 0.
task_score = max(task_score, cur_score)
score += task_score
score = score / len(predictions) * 100
return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchCodeSimEvaluator(BaseEvaluator):
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.
for reference in reference_list:
all_lines = prediction.lstrip('\n').split('\n')
prediction = ''
for line in all_lines:
if ('`' not in line) and ('#'
not in line) and ('//'
not in line):
prediction = line
break
task_score = max(task_score,
(fuzz.ratio(prediction, reference) / 100))
score += task_score
score = score / len(predictions) * 100
return {'score': score}
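# Usage sketch: the evaluator keeps the first generated line that contains no
# back-tick, '#' or '//' and fuzzy-matches it against the reference:
#
#     evaluator = LongBenchCodeSimEvaluator()
#     evaluator.score(predictions=['```python\nreturn x + 1\n```'],
#                     references=[['return x + 1']])
#
# The fenced lines are skipped, 'return x + 1' matches the reference exactly,
# and the returned score is 100.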
@ICL_EVALUATORS.register_module()
class LongBenchClassificationEvaluator(BaseEvaluator):
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]['answers']
for reference in reference_list:
em_match_list = []
all_classes = references[i]['all_classes']
for class_name in all_classes:
if class_name in prediction:
em_match_list.append(class_name)
for match_term in em_match_list:
if match_term in reference and match_term != reference:
em_match_list.remove(match_term)
                # Fall back to fuzzy matching against the label set only when
                # no class name appears verbatim in the prediction.
                if len(em_match_list) != 0:
if reference in em_match_list:
score += (1.0 / len(em_match_list))
else:
best_match = None
highest_similarity = 0
for names in all_classes:
similarity = difflib.SequenceMatcher(
None, names, prediction).ratio()
if similarity > highest_similarity:
highest_similarity = similarity
best_match = names
score += float(best_match == reference)
score = score / len(predictions) * 100
return {'score': score}
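# Usage sketch: unlike the other evaluators, each reference here is a dict
# carrying both the gold answers and the candidate label set (toy labels):
#
#     evaluator = LongBenchClassificationEvaluator()
#     evaluator.score(
#         predictions=['This ticket is about billing.'],
#         references=[{'answers': ['billing'],
#                      'all_classes': ['billing', 'shipping', 'refund']}])
#
# 'billing' is the only class name found verbatim in the prediction, so the
# sample scores 1.0 and the overall score is 100.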
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBench2wikimqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
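Each of these loaders forwards its keyword arguments straight to `datasets.load_dataset` and rebuilds the test split with only the fields the prompts need. A hedged usage sketch, assuming the public `THUDM/LongBench` release on the Hugging Face Hub (the path and subset name are assumptions, not part of this commit):

from opencompass.datasets import LongBench2wikimqaDataset

# Assumed path/name of the Hugging Face LongBench release; point these at a
# local copy if the Hub is unavailable.
ds = LongBench2wikimqaDataset.load(path='THUDM/LongBench', name='2wikimqa')
print(ds['test'][0].keys())  # expected fields: input, context, answers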
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchdureaderDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchgov_reportDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({'context': context, 'answers': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchhotpotqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchlccDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({'context': context, 'answers': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchlshtDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
all_classes = dataset[split]['all_classes'][i]
raw_data.append({
'input': question,
'context': context,
'all_labels': {
'answers': answers,
'all_classes': all_classes
}
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
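The classification-style loaders keep the gold answers and the full label inventory together under `all_labels`, because LongBenchClassificationEvaluator reads both `answers` and `all_classes` from each reference. A sketch of the row shape this produces (field values are placeholders):

# One rebuilt test row, roughly:
# {
#     'input': '...',       # the query text
#     'context': '...',     # the long document
#     'all_labels': {
#         'answers': ['<gold label>'],
#         'all_classes': ['<label 1>', '<label 2>', ...]  # full label set
#     }
# }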
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmultifieldqa_enDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmultifieldqa_zhDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmusiqueDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchnarrativeqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchnqDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_countDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({'context': context, 'answers': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_retrieval_enDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_retrieval_zhDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchqasperDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchqmsumDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset