Unverified Commit bbec7d87 authored by yuantao2108, committed by GitHub

[Feature] add lveval benchmark (#914)



* add lveval benchmark

* add LVEval readme file

* update LVEval readme file

* Update configs/eval_bluelm_32k_lveval.py

* Update configs/eval_llama2_7b_lveval.py

---------
Co-authored-by: yuantao <yuantao@infini-ai.com>
Co-authored-by: Mo Li <82895469+DseidLi@users.noreply.github.com>
parent 8142f399
from mmengine.config import read_base
with read_base():
from .lveval_multifieldqa_en_mixup_gen_d7ea36 import (
LVEval_multifieldqa_en_mixup_datasets,
) # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LVEvalOPTF1Evaluator,
LVEvalmultifieldqaenDataset,
)
LVEval_multifieldqa_en_mixup_reader_cfg = dict(
input_columns=["context", "input"],
output_column="answers",
train_split="test",
test_split="test",
)
LVEval_multifieldqa_en_mixup_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt="Please answer the following question based on the given passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nArticle: {context}\n\nPlease answer the following question based on the above passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nQuestion: {input}\nAnswer:",
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=64),
)
LVEval_multifieldqa_en_mixup_eval_cfg = dict(
evaluator=dict(type=LVEvalOPTF1Evaluator, language="en"), pred_role="BOT"
)
DATASET_LENGTH_LEVEL = ["16k", "32k", "64k", "128k", "256k"]
def get_dataset_names(dataset_name, length_levels):
datasets = []
for length in length_levels:
datasets.append(f"{dataset_name}_{length}")
return datasets
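# Illustrative note: with the DATASET_LENGTH_LEVEL above,
# get_dataset_names("multifieldqa_en_mixup", DATASET_LENGTH_LEVEL) returns
# ["multifieldqa_en_mixup_16k", "multifieldqa_en_mixup_32k",
#  "multifieldqa_en_mixup_64k", "multifieldqa_en_mixup_128k",
#  "multifieldqa_en_mixup_256k"], i.e. one HF subset name per context length.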
LVEval_multifieldqa_en_mixup_datasets = [
dict(
type=LVEvalmultifieldqaenDataset,
abbr="LVEval_" + name_len,
path="Infinigence/LVEval",
name=name_len,
reader_cfg=LVEval_multifieldqa_en_mixup_reader_cfg,
infer_cfg=LVEval_multifieldqa_en_mixup_infer_cfg,
eval_cfg=LVEval_multifieldqa_en_mixup_eval_cfg,
)
for name_len in get_dataset_names(
"multifieldqa_en_mixup", DATASET_LENGTH_LEVEL
)
]
from mmengine.config import read_base
with read_base():
from .lveval_multifieldqa_zh_mixup_gen_0fbdad import (
LVEval_multifieldqa_zh_mixup_datasets,
) # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LVEvalOPTF1Evaluator,
LVEvalmultifieldqazhDataset,
)
LVEval_multifieldqa_zh_mixup_reader_cfg = dict(
input_columns=["context", "input"],
output_column="answers",
train_split="test",
test_split="test",
)
LVEval_multifieldqa_zh_mixup_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt="请阅读以下文章并用中文回答问题,问题和答案只与其中一篇文章有关。只需要直接给出问题的答案,不要输出其他任何解释和证据。\n\n文章:{context}\n\n请基于上面的文章回答下面的问题,问题和答案只与其中一篇文章有关。只需要直接给出问题的答案,不要输出其他任何解释和证据。\n\n问题:{input}\n回答:",
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=64),
)
LVEval_multifieldqa_zh_mixup_eval_cfg = dict(
evaluator=dict(type=LVEvalOPTF1Evaluator, language="zh"), pred_role="BOT"
)
DATASET_LENGTH_LEVEL = ["16k", "32k", "64k", "128k", "256k"]
def get_dataset_names(dataset_name, length_levels):
datasets = []
for length in length_levels:
datasets.append(f"{dataset_name}_{length}")
return datasets
LVEval_multifieldqa_zh_mixup_datasets = [
dict(
type=LVEvalmultifieldqazhDataset,
abbr="LVEval_" + name_len,
path="Infinigence/LVEval",
name=name_len,
reader_cfg=LVEval_multifieldqa_zh_mixup_reader_cfg,
infer_cfg=LVEval_multifieldqa_zh_mixup_infer_cfg,
eval_cfg=LVEval_multifieldqa_zh_mixup_eval_cfg,
)
for name_len in get_dataset_names(
"multifieldqa_zh_mixup", DATASET_LENGTH_LEVEL
)
]
from mmengine.config import read_base
with read_base():
from .datasets.lveval.lveval import LVEval_datasets as datasets
from .models.bluelm.hf_bluelm_7b_chat_32k import models
from .summarizers.lveval import summarizer
models[0][
"path"
] = "/path/to/your/huggingface_models/BlueLM-7B-Chat-32K"
models[0][
"tokenizer_path"
] = "/path/to/your/huggingface_models/BlueLM-7B-Chat-32K"
models[0]["max_seq_len"] = 32768
models[0]["generation_kwargs"] = dict(do_sample=False)
models[0]["mode"] = "mid" # truncate in the middle
from mmengine.config import read_base
with read_base():
from .datasets.lveval.lveval import LVEval_datasets as datasets
from .models.hf_llama.hf_llama2_7b_chat import models
from .summarizers.lveval import summarizer
models[0][
"path"
] = "/path/to/your/huggingface_models/Llama-2-7b-chat-hf"
models[0][
"tokenizer_path"
] = "/path/to/your/huggingface_models/Llama-2-7b-chat-hf"
models[0]["max_seq_len"] = 4096
models[0]["generation_kwargs"] = dict(do_sample=False)
models[0]["mode"] = "mid" # truncate in the middle
len_levels = ["16k", "32k", "64k", "128k", "256k"]
subsets_lveval_loogle_SD_mixup = [
"LVEval_loogle_SD_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_cmrc_mixup = [
"LVEval_cmrc_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_multifieldqa_en_mixup = [
"LVEval_multifieldqa_en_mixup" + "_" + len_level
for len_level in len_levels
]
subsets_lveval_multifieldqa_zh_mixup = [
"LVEval_multifieldqa_zh_mixup" + "_" + len_level
for len_level in len_levels
]
subsets_lveval_dureader_mixup = [
"LVEval_dureader_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_loogle_CR_mixup = [
"LVEval_loogle_CR_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_loogle_MIR_mixup = [
"LVEval_loogle_MIR_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_hotpotwikiqa_mixup = [
"LVEval_hotpotwikiqa_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_lic_mixup = [
"LVEval_lic_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_factrecall_en = [
"LVEval_factrecall_en" + "_" + len_level for len_level in len_levels
]
subsets_lveval_factrecall_zh = [
"LVEval_factrecall_zh" + "_" + len_level for len_level in len_levels
]
subsets_lveval_single_hop_qa = (
subsets_lveval_loogle_SD_mixup + subsets_lveval_cmrc_mixup
)
subsets_lveval_single_hop_cqa = (
subsets_lveval_multifieldqa_en_mixup + subsets_lveval_multifieldqa_zh_mixup
)
subsets_lveval_multi_hop_qa = (
subsets_lveval_dureader_mixup
+ subsets_lveval_loogle_CR_mixup
+ subsets_lveval_loogle_MIR_mixup
)
subsets_lveval_multi_hop_cqa = (
subsets_lveval_hotpotwikiqa_mixup + subsets_lveval_lic_mixup
)
subsets_lveval_factrecall_cqa = (
subsets_lveval_factrecall_en + subsets_lveval_factrecall_zh
)
subsets_lveval_qa = (
subsets_lveval_single_hop_qa
+ subsets_lveval_single_hop_cqa
+ subsets_lveval_multi_hop_qa
+ subsets_lveval_multi_hop_cqa
+ subsets_lveval_factrecall_cqa
)
lveval_summary_groups = [
{
"name": "LVEval_loogle_SD_mixup",
"subsets": subsets_lveval_loogle_SD_mixup,
},
{"name": "LVEval_cmrc_mixup", "subsets": subsets_lveval_cmrc_mixup},
{
"name": "LVEval_multifieldqa_en_mixup",
"subsets": subsets_lveval_multifieldqa_en_mixup,
},
{
"name": "LVEval_multifieldqa_zh_mixup",
"subsets": subsets_lveval_multifieldqa_zh_mixup,
},
{
"name": "LVEval_dureader_mixup",
"subsets": subsets_lveval_dureader_mixup,
},
{
"name": "LVEval_loogle_CR_mixup",
"subsets": subsets_lveval_loogle_CR_mixup,
},
{
"name": "LVEval_loogle_MIR_mixup",
"subsets": subsets_lveval_loogle_MIR_mixup,
},
{
"name": "LVEval_hotpotwikiqa_mixup",
"subsets": subsets_lveval_hotpotwikiqa_mixup,
},
{"name": "LVEval_lic_mixup", "subsets": subsets_lveval_lic_mixup},
{"name": "LVEval_factrecall_en", "subsets": subsets_lveval_factrecall_en},
{"name": "LVEval_factrecall_zh", "subsets": subsets_lveval_factrecall_zh},
{"name": "LVEval_single_hop_qa", "subsets": subsets_lveval_single_hop_qa},
{
"name": "LVEval_single_hop_cqa",
"subsets": subsets_lveval_single_hop_cqa,
},
{"name": "LVEval_multi_hop_qa", "subsets": subsets_lveval_multi_hop_qa},
{"name": "LVEval_multi_hop_cqa", "subsets": subsets_lveval_multi_hop_cqa},
{
"name": "LVEval_factrecall_cqa",
"subsets": subsets_lveval_factrecall_cqa,
},
{"name": "LVEval_qa", "subsets": subsets_lveval_qa},
]
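# Note: each summary group above aggregates one dataset (or task family)
# across its five length levels, and "LVEval_qa" rolls every subset into a
# single overall score (OpenCompass summary groups default to a plain mean
# over their subsets).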
from mmengine.config import read_base
with read_base():
from .groups.lveval import lveval_summary_groups
summarizer = dict(
dataset_abbrs=[
"----------------------------------------",
"--------- LVEval All ---------", # category
"----------------------------------------",
"LVEval_qa",
"----------------------------------------",
"--------- LVEval Tasks All ---------", # category
"----------------------------------------",
"LVEval_single_hop_qa",
"LVEval_single_hop_cqa",
"LVEval_multi_hop_qa",
"LVEval_multi_hop_cqa",
"LVEval_factrecall_cqa",
"----------------------------------------",
"--------- LVEval Datasets All ---------", # category
"----------------------------------------",
"LVEval_loogle_SD_mixup",
"LVEval_cmrc_mixup",
"LVEval_multifieldqa_en_mixup",
"LVEval_multifieldqa_zh_mixup",
"LVEval_dureader_mixup",
"LVEval_loogle_CR_mixup",
"LVEval_loogle_MIR_mixup",
"LVEval_hotpotwikiqa_mixup",
"LVEval_lic_mixup",
"LVEval_factrecall_en",
"LVEval_factrecall_zh",
"----------------------------------------",
"--------- LVEval Single_Hop QA ---------", # category
"----------------------------------------",
"LVEval_loogle_SD_mixup_16k",
"LVEval_loogle_SD_mixup_32k",
"LVEval_loogle_SD_mixup_64k",
"LVEval_loogle_SD_mixup_128k",
"LVEval_loogle_SD_mixup_256k",
"----------------------------------------",
"LVEval_cmrc_mixup_16k",
"LVEval_cmrc_mixup_32k",
"LVEval_cmrc_mixup_64k",
"LVEval_cmrc_mixup_128k",
"LVEval_cmrc_mixup_256k",
"----------------------------------------",
"--------- LVEval Single_Hop CQA ---------", # category
"----------------------------------------",
"LVEval_multifieldqa_en_mixup_16k",
"LVEval_multifieldqa_en_mixup_32k",
"LVEval_multifieldqa_en_mixup_64k",
"LVEval_multifieldqa_en_mixup_128k",
"LVEval_multifieldqa_en_mixup_256k",
"----------------------------------------",
"LVEval_multifieldqa_zh_mixup_16k",
"LVEval_multifieldqa_zh_mixup_32k",
"LVEval_multifieldqa_zh_mixup_64k",
"LVEval_multifieldqa_zh_mixup_128k",
"LVEval_multifieldqa_zh_mixup_256k",
"----------------------------------------",
"--------- LVEval Multi_Hop QA ---------", # category
"----------------------------------------",
"LVEval_dureader_mixup_16k",
"LVEval_dureader_mixup_32k",
"LVEval_dureader_mixup_64k",
"LVEval_dureader_mixup_128k",
"LVEval_dureader_mixup_256k",
"----------------------------------------",
"LVEval_loogle_CR_mixup_16k",
"LVEval_loogle_CR_mixup_32k",
"LVEval_loogle_CR_mixup_64k",
"LVEval_loogle_CR_mixup_128k",
"LVEval_loogle_CR_mixup_256k",
"----------------------------------------",
"LVEval_loogle_MIR_mixup_16k",
"LVEval_loogle_MIR_mixup_32k",
"LVEval_loogle_MIR_mixup_64k",
"LVEval_loogle_MIR_mixup_128k",
"LVEval_loogle_MIR_mixup_256k",
"----------------------------------------",
"--------- LVEval Multi_Hop CQA ---------", # category
"----------------------------------------",
"LVEval_hotpotwikiqa_mixup_16k",
"LVEval_hotpotwikiqa_mixup_32k",
"LVEval_hotpotwikiqa_mixup_64k",
"LVEval_hotpotwikiqa_mixup_128k",
"LVEval_hotpotwikiqa_mixup_256k",
"----------------------------------------",
"LVEval_lic_mixup_16k",
"LVEval_lic_mixup_32k",
"LVEval_lic_mixup_64k",
"LVEval_lic_mixup_128k",
"LVEval_lic_mixup_256k",
"----------------------------------------",
"--------- LVEval Factrecall CQA ---------", # category
"----------------------------------------",
"LVEval_factrecall_en_16k",
"LVEval_factrecall_en_32k",
"LVEval_factrecall_en_64k",
"LVEval_factrecall_en_128k",
"LVEval_factrecall_en_256k",
"----------------------------------------",
"LVEval_factrecall_zh_16k",
"LVEval_factrecall_zh_32k",
"LVEval_factrecall_zh_64k",
"LVEval_factrecall_zh_128k",
"LVEval_factrecall_zh_256k",
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []
),
)
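# The locals() expression above simply gathers every "*_summary_groups" list
# that read_base() pulled into this config's namespace (here only
# lveval_summary_groups) and flattens them into one summary_groups list.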
@@ -58,6 +58,7 @@ from .lawbench import * # noqa: F401, F403
from .lcsts import * # noqa: F401, F403
from .leval import * # noqa: F401, F403
from .longbench import * # noqa: F401, F403
from .lveval import * # noqa: F401, F403
from .mastermath2024v1 import * # noqa: F401, F403
from .math import * # noqa: F401, F403
from .math401 import * # noqa: F401, F403
from .evaluators import LVEvalF1Evaluator # noqa: F401, F403
from .evaluators import LVEvalOPTF1Evaluator # noqa: F401, F403
from .evaluators import LVEvalOPTRougeEvaluator # noqa: F401, F403
from .lveval_cmrc_mixup import * # noqa: F401, F403
from .lveval_dureader_mixup import * # noqa: F401, F403
from .lveval_factrecall_en import * # noqa: F401, F403
from .lveval_factrecall_zh import * # noqa: F401, F403
from .lveval_hotpotwikiqa_mixup import * # noqa: F401, F403
from .lveval_lic_mixup import * # noqa: F401, F403
from .lveval_loogle_CR_mixup import * # noqa: F401, F403
from .lveval_loogle_MIR_mixup import * # noqa: F401, F403
from .lveval_loogle_SD_mixup import * # noqa: F401, F403
from .lveval_multifieldqa_en_mixup import * # noqa: F401, F403
from .lveval_multifieldqa_zh_mixup import * # noqa: F401, F403
"""Functions for computing metrics.
Part of the following code is modified from `https://github.com/THUDM/LongBench`.
"""
import re
import string
from collections import Counter
from typing import List
import jieba
from rouge import Rouge
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
ABANDON_WORDS_EN = [
'and',
'to',
'of',
'in',
'her',
'was',
'with',
'for',
'it',
'from',
'is',
'that',
'his',
'he',
'by',
'she',
'they',
'or',
'at',
'because',
'be',
'on',
'are',
'their',
'what',
'as',
'had',
'were',
'about',
'being',
'this',
'who',
'but',
'have',
'has',
'when',
'which',
'does',
]
ABANDON_WORDS_ZH = [
'的',
'和',
'是',
'等',
'在',
'年',
'可以',
'为',
'与',
'‰',
'了',
'或',
'一种',
'月',
'c',
'至',
'日',
'有',
'进行',
'于',
'不',
'中',
'×',
'根据',
'小',
'由',
'亩',
'也',
'要',
'指',
'法',
'会',
'元',
'主要',
'以及',
'通过',
'首先',
'对',
'然后',
'号',
'以',
'所',
'后',
'丁',
'包括',
'无',
'将',
'用',
'能',
'形',
'方面',
'因素',
'位于',
'而',
'从',
'到',
'一定',
'用于',
'但',
'使用',
'让',
'具有',
'并',
'亿元',
'万元',
'上',
'类',
'基于',
'才',
'来',
'地',
'片',
'其他',
'个',
'或者',
'变得',
'时',
'给',
'你',
'使',
'条',
'受',
'已经',
'带',
'度',
]
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def normalize_zh_answer(s):
"""Lower text and remove punctuation, extra whitespace."""
def white_space_fix(text):
return ''.join(text.split())
def remove_punc(text):
cn_punctuation = '！？。。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.'
all_punctuation = set(string.punctuation + cn_punctuation)
return ''.join(ch for ch in text if ch not in all_punctuation)
def lower(text):
return text.lower()
return white_space_fix(remove_punc(lower(s)))
@ICL_EVALUATORS.register_module()
class LVEvalF1Evaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
def f1_score(prediction, reference, **kwargs):
common = Counter(prediction) & Counter(reference)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction)
recall = 1.0 * num_same / len(reference)
f1 = (2 * precision * recall) / (precision + recall)
return f1
score = 0.0
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.0
for reference in reference_list:
if self.language == 'en':
normalized_prediction = normalize_answer(prediction)
normalized_reference = normalize_answer(reference)
prediction_tokens = normalized_prediction.split()
reference_tokens = normalized_reference.split()
else:
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
prediction_tokens = [
token for token in prediction_tokens if len(token) > 0
]
reference_tokens = [
token for token in reference_tokens if len(token) > 0
]
task_score = max(task_score,
f1_score(prediction_tokens, reference_tokens))
break
score += task_score
score = score / len(predictions) * 100
return {'f1': score}
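# Worked example (illustrative, not part of the shipped code):
#   LVEvalF1Evaluator(language='en').score(
#       predictions=['George Washington'], references=[['Washington']])
# normalizes and tokenizes both sides, finds the token overlap
# {'washington': 1}, so precision = 1/2, recall = 1/1, F1 = 2/3, and the
# returned score is {'f1': 66.67} (per-sample scores are averaged and
# scaled to 0-100).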
@ICL_EVALUATORS.register_module()
class LVEvalOPTF1Evaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
def f1_score(prediction, reference, **kwargs):
common = Counter(prediction) & Counter(reference)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction)
recall = 1.0 * num_same / len(reference)
f1 = (2 * precision * recall) / (precision + recall)
return f1
score = 0.0
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
answer_keyword = reference_list[-1]
task_score = 0.0
for reference in reference_list:
if self.language == 'en':
normalized_prediction = normalize_answer(prediction)
normalized_reference = normalize_answer(reference)
prediction_tokens = normalized_prediction.split()
reference_tokens = normalized_reference.split()
# answer keywords recall
if answer_keyword:
answer_keyword_tokens = normalize_answer(
answer_keyword)
answer_keyword_tokens = answer_keyword_tokens.split()
common = Counter(prediction_tokens) & Counter(
answer_keyword_tokens)
filtered_common = {
key: value
for key, value in common.items()
if key not in ABANDON_WORDS_EN
}
num_same = sum(filtered_common.values())
recall = 1.0 * num_same / len(answer_keyword_tokens)
if recall < 0.2:
break
else:
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
prediction_tokens = [
token for token in prediction_tokens if len(token) > 0
]
reference_tokens = [
token for token in reference_tokens if len(token) > 0
]
if not answer_keyword:
answer_keyword = reference
if answer_keyword:
answer_keyword_tokens = list(
jieba.cut(answer_keyword, cut_all=False))
answer_keyword_tokens = [
normalize_zh_answer(token)
for token in answer_keyword_tokens
]
answer_keyword_tokens = [
token for token in answer_keyword_tokens
if len(token) > 0
]
common = Counter(prediction_tokens) & Counter(
answer_keyword_tokens)
filtered_common = {
key: value
for key, value in common.items()
if key not in ABANDON_WORDS_ZH
}
num_same = sum(filtered_common.values())
recall = 1.0 * num_same / len(answer_keyword_tokens)
if recall < 0.4:
break
task_score = max(task_score,
f1_score(prediction_tokens, reference_tokens))
break
score += task_score
score = score / len(predictions) * 100
return {'LVEval_f1': score}
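# Note on the "OPT" variant: the last entry of each reference list is treated
# as the answer keyword (several dataset loaders below append answer_keywords
# to the answers list). If the prediction's recall of the keyword tokens,
# after dropping the ABANDON_WORDS stop words, falls below 0.2 (en) or
# 0.4 (zh), the sample scores 0; otherwise the usual token-level F1 against
# the gold answer is used.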
@ICL_EVALUATORS.register_module()
class LVEvalOPTRougeEvaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
score = 0.0
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.0
for reference in reference_list:
if self.language == 'zh':
word_blacklist = ABANDON_WORDS_ZH
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
else:
word_blacklist = ABANDON_WORDS_EN
prediction_tokens = normalize_answer(prediction)
reference_tokens = normalize_answer(reference)
prediction_tokens = prediction_tokens.split()
reference_tokens = reference_tokens.split()
filtered_prediction_tokens = [
i for i in prediction_tokens if i not in word_blacklist
]
filtered_reference_tokens = [
i for i in reference_tokens if i not in word_blacklist
]
prediction = ' '.join(filtered_prediction_tokens)
reference = ' '.join(filtered_reference_tokens)
rouge = Rouge()
try:
cur_score = rouge.get_scores([prediction], [reference],
avg=True)['rouge-l']['f']
except Exception:
cur_score = 0.0
task_score = max(task_score, cur_score)
break
score += task_score
score = score / len(predictions) * 100
return {'LVEval_rouge': score}
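# Note: this evaluator applies the same stop-word filtering and then scores
# the prediction against the reference with ROUGE-L (F measure) from the
# `rouge` package, again averaging per-sample scores and scaling to 0-100,
# returned as {'LVEval_rouge': score}.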
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalcmrcDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
'confusing_facts': confusing_facts,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
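# Usage sketch (illustrative): the gen configs call these loaders with the HF
# repo and subset name, e.g.
#   LVEvalcmrcDataset.load(path='Infinigence/LVEval', name='cmrc_mixup_16k')
# The kwargs are forwarded to datasets.load_dataset, and the 'test' split is
# rebuilt to keep only the fields used by the prompt template and evaluator.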
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvaldureaderDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalfactrecallenDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
'confusing_facts': confusing_facts,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalfactrecallzhDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
'confusing_facts': confusing_facts,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalhotpotwikiqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'confusing_facts': confusing_facts,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallicDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'confusing_facts': confusing_facts,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallooglecrDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallooglemirDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallooglesdDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset