Unverified Commit bbec7d87 authored by yuantao2108, committed by GitHub

[Feature] add lveval benchmark (#914)



* add lveval benchmark

* add LVEval readme file

* update LVEval readme file

* Update configs/eval_bluelm_32k_lveval.py

* Update configs/eval_llama2_7b_lveval.py

---------
Co-authored-by: yuantao <yuantao@infini-ai.com>
Co-authored-by: Mo Li <82895469+DseidLi@users.noreply.github.com>
parent 8142f399
from mmengine.config import read_base
with read_base():
from .lveval_multifieldqa_en_mixup_gen_d7ea36 import (
LVEval_multifieldqa_en_mixup_datasets,
) # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LVEvalOPTF1Evaluator,
LVEvalmultifieldqaenDataset,
)
LVEval_multifieldqa_en_mixup_reader_cfg = dict(
input_columns=["context", "input"],
output_column="answers",
train_split="test",
test_split="test",
)
LVEval_multifieldqa_en_mixup_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt="Please answer the following question based on the given passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nArticle: {context}\n\nPlease answer the following question based on the above passages. Questions and answers are only relevant to one passage. Only give me the answer and do not output any other explanation and evidence.\n\nQuestion: {input}\nAnswer:",
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=64),
)
LVEval_multifieldqa_en_mixup_eval_cfg = dict(
evaluator=dict(type=LVEvalOPTF1Evaluator, language="en"), pred_role="BOT"
)
DATASET_LENGTH_LEVEL = ["16k", "32k", "64k", "128k", "256k"]
def get_dataset_names(dataset_name, length_levels):
datasets = []
for length in length_levels:
datasets.append(f"{dataset_name}_{length}")
return datasets
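# Illustrative note: with the DATASET_LENGTH_LEVEL above,
# get_dataset_names("multifieldqa_en_mixup", DATASET_LENGTH_LEVEL) returns
# ["multifieldqa_en_mixup_16k", "multifieldqa_en_mixup_32k",
#  "multifieldqa_en_mixup_64k", "multifieldqa_en_mixup_128k",
#  "multifieldqa_en_mixup_256k"], i.e. one HF subset name per context length.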
LVEval_multifieldqa_en_mixup_datasets = [
dict(
type=LVEvalmultifieldqaenDataset,
abbr="LVEval_" + name_len,
path="Infinigence/LVEval",
name=name_len,
reader_cfg=LVEval_multifieldqa_en_mixup_reader_cfg,
infer_cfg=LVEval_multifieldqa_en_mixup_infer_cfg,
eval_cfg=LVEval_multifieldqa_en_mixup_eval_cfg,
)
for name_len in get_dataset_names(
"multifieldqa_en_mixup", DATASET_LENGTH_LEVEL
)
]
from mmengine.config import read_base
with read_base():
from .lveval_multifieldqa_zh_mixup_gen_0fbdad import (
LVEval_multifieldqa_zh_mixup_datasets,
) # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LVEvalOPTF1Evaluator,
LVEvalmultifieldqazhDataset,
)
LVEval_multifieldqa_zh_mixup_reader_cfg = dict(
input_columns=["context", "input"],
output_column="answers",
train_split="test",
test_split="test",
)
LVEval_multifieldqa_zh_mixup_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt="请阅读以下文章并用中文回答问题,问题和答案只与其中一篇文章有关。只需要直接给出问题的答案,不要输出其他任何解释和证据。\n\n文章:{context}\n\n请基于上面的文章回答下面的问题,问题和答案只与其中一篇文章有关。只需要直接给出问题的答案,不要输出其他任何解释和证据。\n\n问题:{input}\n回答:",
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=64),
)
LVEval_multifieldqa_zh_mixup_eval_cfg = dict(
evaluator=dict(type=LVEvalOPTF1Evaluator, language="zh"), pred_role="BOT"
)
DATASET_LENGTH_LEVEL = ["16k", "32k", "64k", "128k", "256k"]
def get_dataset_names(dataset_name, length_levels):
datasets = []
for length in length_levels:
datasets.append(f"{dataset_name}_{length}")
return datasets
LVEval_multifieldqa_zh_mixup_datasets = [
dict(
type=LVEvalmultifieldqazhDataset,
abbr="LVEval_" + name_len,
path="Infinigence/LVEval",
name=name_len,
reader_cfg=LVEval_multifieldqa_zh_mixup_reader_cfg,
infer_cfg=LVEval_multifieldqa_zh_mixup_infer_cfg,
eval_cfg=LVEval_multifieldqa_zh_mixup_eval_cfg,
)
for name_len in get_dataset_names(
"multifieldqa_zh_mixup", DATASET_LENGTH_LEVEL
)
]
from mmengine.config import read_base
with read_base():
from .datasets.lveval.lveval import LVEval_datasets as datasets
from .models.bluelm.hf_bluelm_7b_chat_32k import models
from .summarizers.lveval import summarizer
models[0][
"path"
] = "/path/to/your/huggingface_models/BlueLM-7B-Chat-32K"
models[0][
"tokenizer_path"
] = "/path/to/your/huggingface_models/BlueLM-7B-Chat-32K"
models[0]["max_seq_len"] = 32768
models[0]["generation_kwargs"] = dict(do_sample=False)
models[0]["mode"] = "mid" # truncate in the middle
from mmengine.config import read_base
with read_base():
from .datasets.lveval.lveval import LVEval_datasets as datasets
from .models.hf_llama.hf_llama2_7b_chat import models
from .summarizers.lveval import summarizer
models[0][
"path"
] = "/path/to/your/huggingface_models/Llama-2-7b-chat-hf"
models[0][
"tokenizer_path"
] = "/path/to/your/huggingface_models/Llama-2-7b-chat-hf"
models[0]["max_seq_len"] = 4096
models[0]["generation_kwargs"] = dict(do_sample=False)
models[0]["mode"] = "mid" # truncate in the middle
len_levels = ["16k", "32k", "64k", "128k", "256k"]
subsets_lveval_loogle_SD_mixup = [
"LVEval_loogle_SD_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_cmrc_mixup = [
"LVEval_cmrc_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_multifieldqa_en_mixup = [
"LVEval_multifieldqa_en_mixup" + "_" + len_level
for len_level in len_levels
]
subsets_lveval_multifieldqa_zh_mixup = [
"LVEval_multifieldqa_zh_mixup" + "_" + len_level
for len_level in len_levels
]
subsets_lveval_dureader_mixup = [
"LVEval_dureader_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_loogle_CR_mixup = [
"LVEval_loogle_CR_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_loogle_MIR_mixup = [
"LVEval_loogle_MIR_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_hotpotwikiqa_mixup = [
"LVEval_hotpotwikiqa_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_lic_mixup = [
"LVEval_lic_mixup" + "_" + len_level for len_level in len_levels
]
subsets_lveval_factrecall_en = [
"LVEval_factrecall_en" + "_" + len_level for len_level in len_levels
]
subsets_lveval_factrecall_zh = [
"LVEval_factrecall_zh" + "_" + len_level for len_level in len_levels
]
subsets_lveval_single_hop_qa = (
subsets_lveval_loogle_SD_mixup + subsets_lveval_cmrc_mixup
)
subsets_lveval_single_hop_cqa = (
subsets_lveval_multifieldqa_en_mixup + subsets_lveval_multifieldqa_zh_mixup
)
subsets_lveval_multi_hop_qa = (
subsets_lveval_dureader_mixup
+ subsets_lveval_loogle_CR_mixup
+ subsets_lveval_loogle_MIR_mixup
)
subsets_lveval_multi_hop_cqa = (
subsets_lveval_hotpotwikiqa_mixup + subsets_lveval_lic_mixup
)
subsets_lveval_factrecall_cqa = (
subsets_lveval_factrecall_en + subsets_lveval_factrecall_zh
)
subsets_lveval_qa = (
subsets_lveval_single_hop_qa
+ subsets_lveval_single_hop_cqa
+ subsets_lveval_multi_hop_qa
+ subsets_lveval_multi_hop_cqa
+ subsets_lveval_factrecall_cqa
)
lveval_summary_groups = [
{
"name": "LVEval_loogle_SD_mixup",
"subsets": subsets_lveval_loogle_SD_mixup,
},
{"name": "LVEval_cmrc_mixup", "subsets": subsets_lveval_cmrc_mixup},
{
"name": "LVEval_multifieldqa_en_mixup",
"subsets": subsets_lveval_multifieldqa_en_mixup,
},
{
"name": "LVEval_multifieldqa_zh_mixup",
"subsets": subsets_lveval_multifieldqa_zh_mixup,
},
{
"name": "LVEval_dureader_mixup",
"subsets": subsets_lveval_dureader_mixup,
},
{
"name": "LVEval_loogle_CR_mixup",
"subsets": subsets_lveval_loogle_CR_mixup,
},
{
"name": "LVEval_loogle_MIR_mixup",
"subsets": subsets_lveval_loogle_MIR_mixup,
},
{
"name": "LVEval_hotpotwikiqa_mixup",
"subsets": subsets_lveval_hotpotwikiqa_mixup,
},
{"name": "LVEval_lic_mixup", "subsets": subsets_lveval_lic_mixup},
{"name": "LVEval_factrecall_en", "subsets": subsets_lveval_factrecall_en},
{"name": "LVEval_factrecall_zh", "subsets": subsets_lveval_factrecall_zh},
{"name": "LVEval_single_hop_qa", "subsets": subsets_lveval_single_hop_qa},
{
"name": "LVEval_single_hop_cqa",
"subsets": subsets_lveval_single_hop_cqa,
},
{"name": "LVEval_multi_hop_qa", "subsets": subsets_lveval_multi_hop_qa},
{"name": "LVEval_multi_hop_cqa", "subsets": subsets_lveval_multi_hop_cqa},
{
"name": "LVEval_factrecall_cqa",
"subsets": subsets_lveval_factrecall_cqa,
},
{"name": "LVEval_qa", "subsets": subsets_lveval_qa},
]
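# Note: each summary group above aggregates one dataset (or task family)
# across its five length levels, and "LVEval_qa" rolls every subset into a
# single overall score (OpenCompass summary groups default to a plain mean
# over their subsets).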
from mmengine.config import read_base
with read_base():
from .groups.lveval import lveval_summary_groups
summarizer = dict(
dataset_abbrs=[
"----------------------------------------",
"--------- LVEval All ---------", # category
"----------------------------------------",
"LVEval_qa",
"----------------------------------------",
"--------- LVEval Tasks All ---------", # category
"----------------------------------------",
"LVEval_single_hop_qa",
"LVEval_single_hop_cqa",
"LVEval_multi_hop_qa",
"LVEval_multi_hop_cqa",
"LVEval_factrecall_cqa",
"----------------------------------------",
"--------- LVEval Datasets All ---------", # category
"----------------------------------------",
"LVEval_loogle_SD_mixup",
"LVEval_cmrc_mixup",
"LVEval_multifieldqa_en_mixup",
"LVEval_multifieldqa_zh_mixup",
"LVEval_dureader_mixup",
"LVEval_loogle_CR_mixup",
"LVEval_loogle_MIR_mixup",
"LVEval_hotpotwikiqa_mixup",
"LVEval_lic_mixup",
"LVEval_factrecall_en",
"LVEval_factrecall_zh",
"----------------------------------------",
"--------- LVEval Single_Hop QA ---------", # category
"----------------------------------------",
"LVEval_loogle_SD_mixup_16k",
"LVEval_loogle_SD_mixup_32k",
"LVEval_loogle_SD_mixup_64k",
"LVEval_loogle_SD_mixup_128k",
"LVEval_loogle_SD_mixup_256k",
"----------------------------------------",
"LVEval_cmrc_mixup_16k",
"LVEval_cmrc_mixup_32k",
"LVEval_cmrc_mixup_64k",
"LVEval_cmrc_mixup_128k",
"LVEval_cmrc_mixup_256k",
"----------------------------------------",
"--------- LVEval Single_Hop CQA ---------", # category
"----------------------------------------",
"LVEval_multifieldqa_en_mixup_16k",
"LVEval_multifieldqa_en_mixup_32k",
"LVEval_multifieldqa_en_mixup_64k",
"LVEval_multifieldqa_en_mixup_128k",
"LVEval_multifieldqa_en_mixup_256k",
"----------------------------------------",
"LVEval_multifieldqa_zh_mixup_16k",
"LVEval_multifieldqa_zh_mixup_32k",
"LVEval_multifieldqa_zh_mixup_64k",
"LVEval_multifieldqa_zh_mixup_128k",
"LVEval_multifieldqa_zh_mixup_256k",
"----------------------------------------",
"--------- LVEval Multi_Hop QA ---------", # category
"----------------------------------------",
"LVEval_dureader_mixup_16k",
"LVEval_dureader_mixup_32k",
"LVEval_dureader_mixup_64k",
"LVEval_dureader_mixup_128k",
"LVEval_dureader_mixup_256k",
"----------------------------------------",
"LVEval_loogle_CR_mixup_16k",
"LVEval_loogle_CR_mixup_32k",
"LVEval_loogle_CR_mixup_64k",
"LVEval_loogle_CR_mixup_128k",
"LVEval_loogle_CR_mixup_256k",
"----------------------------------------",
"LVEval_loogle_MIR_mixup_16k",
"LVEval_loogle_MIR_mixup_32k",
"LVEval_loogle_MIR_mixup_64k",
"LVEval_loogle_MIR_mixup_128k",
"LVEval_loogle_MIR_mixup_256k",
"----------------------------------------",
"--------- LVEval Multi_Hop CQA ---------", # category
"----------------------------------------",
"LVEval_hotpotwikiqa_mixup_16k",
"LVEval_hotpotwikiqa_mixup_32k",
"LVEval_hotpotwikiqa_mixup_64k",
"LVEval_hotpotwikiqa_mixup_128k",
"LVEval_hotpotwikiqa_mixup_256k",
"----------------------------------------",
"LVEval_lic_mixup_16k",
"LVEval_lic_mixup_32k",
"LVEval_lic_mixup_64k",
"LVEval_lic_mixup_128k",
"LVEval_lic_mixup_256k",
"----------------------------------------",
"--------- LVEval Factrecall CQA ---------", # category
"----------------------------------------",
"LVEval_factrecall_en_16k",
"LVEval_factrecall_en_32k",
"LVEval_factrecall_en_64k",
"LVEval_factrecall_en_128k",
"LVEval_factrecall_en_256k",
"----------------------------------------",
"LVEval_factrecall_zh_16k",
"LVEval_factrecall_zh_32k",
"LVEval_factrecall_zh_64k",
"LVEval_factrecall_zh_128k",
"LVEval_factrecall_zh_256k",
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []
),
)
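# The locals() expression above simply gathers every "*_summary_groups" list
# that read_base() pulled into this config's namespace (here only
# lveval_summary_groups) and flattens them into one summary_groups list.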
@@ -58,6 +58,7 @@ from .lawbench import * # noqa: F401, F403
from .lcsts import * # noqa: F401, F403
from .leval import * # noqa: F401, F403
from .longbench import * # noqa: F401, F403
from .lveval import * # noqa: F401, F403
from .mastermath2024v1 import * # noqa: F401, F403
from .math import * # noqa: F401, F403
from .math401 import * # noqa: F401, F403
from .evaluators import LVEvalF1Evaluator # noqa: F401, F403
from .evaluators import LVEvalOPTF1Evaluator # noqa: F401, F403
from .evaluators import LVEvalOPTRougeEvaluator # noqa: F401, F403
from .lveval_cmrc_mixup import * # noqa: F401, F403
from .lveval_dureader_mixup import * # noqa: F401, F403
from .lveval_factrecall_en import * # noqa: F401, F403
from .lveval_factrecall_zh import * # noqa: F401, F403
from .lveval_hotpotwikiqa_mixup import * # noqa: F401, F403
from .lveval_lic_mixup import * # noqa: F401, F403
from .lveval_loogle_CR_mixup import * # noqa: F401, F403
from .lveval_loogle_MIR_mixup import * # noqa: F401, F403
from .lveval_loogle_SD_mixup import * # noqa: F401, F403
from .lveval_multifieldqa_en_mixup import * # noqa: F401, F403
from .lveval_multifieldqa_zh_mixup import * # noqa: F401, F403
"""Functions for computing metrics.
Part of the following code is modified from `https://github.com/THUDM/LongBench`.
"""
import re
import string
from collections import Counter
from typing import List
import jieba
from rouge import Rouge
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
ABANDON_WORDS_EN = [
'and',
'to',
'of',
'in',
'her',
'was',
'with',
'for',
'it',
'from',
'is',
'that',
'his',
'he',
'by',
'she',
'they',
'or',
'at',
'because',
'be',
'on',
'are',
'their',
'what',
'as',
'had',
'were',
'about',
'being',
'this',
'who',
'but',
'have',
'has',
'when',
'which',
'does',
]
ABANDON_WORDS_ZH = [
'的',
'和',
'是',
'等',
'在',
'年',
'可以',
'为',
'与',
'‰',
'了',
'或',
'一种',
'月',
'c',
'至',
'日',
'有',
'进行',
'于',
'不',
'中',
'×',
'根据',
'小',
'由',
'亩',
'也',
'要',
'指',
'法',
'会',
'元',
'主要',
'以及',
'通过',
'首先',
'对',
'然后',
'号',
'以',
'所',
'后',
'丁',
'包括',
'无',
'将',
'用',
'能',
'形',
'方面',
'因素',
'位于',
'而',
'从',
'到',
'一定',
'用于',
'但',
'使用',
'让',
'具有',
'并',
'亿元',
'万元',
'上',
'类',
'基于',
'才',
'来',
'地',
'片',
'其他',
'个',
'或者',
'变得',
'时',
'给',
'你',
'使',
'条',
'受',
'已经',
'带',
'度',
]
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def normalize_zh_answer(s):
"""Lower text and remove punctuation, extra whitespace."""
def white_space_fix(text):
return ''.join(text.split())
def remove_punc(text):
cn_punctuation = '！？。。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.'
all_punctuation = set(string.punctuation + cn_punctuation)
return ''.join(ch for ch in text if ch not in all_punctuation)
def lower(text):
return text.lower()
return white_space_fix(remove_punc(lower(s)))
@ICL_EVALUATORS.register_module()
class LVEvalF1Evaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
def f1_score(prediction, reference, **kwargs):
common = Counter(prediction) & Counter(reference)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction)
recall = 1.0 * num_same / len(reference)
f1 = (2 * precision * recall) / (precision + recall)
return f1
score = 0.0
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.0
for reference in reference_list:
if self.language == 'en':
normalized_prediction = normalize_answer(prediction)
normalized_reference = normalize_answer(reference)
prediction_tokens = normalized_prediction.split()
reference_tokens = normalized_reference.split()
else:
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
prediction_tokens = [
token for token in prediction_tokens if len(token) > 0
]
reference_tokens = [
token for token in reference_tokens if len(token) > 0
]
task_score = max(task_score,
f1_score(prediction_tokens, reference_tokens))
break
score += task_score
score = score / len(predictions) * 100
return {'f1': score}
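# Worked example (illustrative, not part of the shipped code):
#   LVEvalF1Evaluator(language='en').score(
#       predictions=['George Washington'], references=[['Washington']])
# normalizes and tokenizes both sides, finds the token overlap
# {'washington': 1}, so precision = 1/2, recall = 1/1, F1 = 2/3, and the
# returned score is {'f1': 66.67} (per-sample scores are averaged and
# scaled to 0-100).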
@ICL_EVALUATORS.register_module()
class LVEvalOPTF1Evaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
def f1_score(prediction, reference, **kwargs):
common = Counter(prediction) & Counter(reference)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction)
recall = 1.0 * num_same / len(reference)
f1 = (2 * precision * recall) / (precision + recall)
return f1
score = 0.0
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
answer_keyword = reference_list[-1]
task_score = 0.0
for reference in reference_list:
if self.language == 'en':
normalized_prediction = normalize_answer(prediction)
normalized_reference = normalize_answer(reference)
prediction_tokens = normalized_prediction.split()
reference_tokens = normalized_reference.split()
# answer keywords recall
if answer_keyword:
answer_keyword_tokens = normalize_answer(
answer_keyword)
answer_keyword_tokens = answer_keyword_tokens.split()
common = Counter(prediction_tokens) & Counter(
answer_keyword_tokens)
filtered_common = {
key: value
for key, value in common.items()
if key not in ABANDON_WORDS_EN
}
num_same = sum(filtered_common.values())
recall = 1.0 * num_same / len(answer_keyword_tokens)
if recall < 0.2:
break
else:
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
prediction_tokens = [
token for token in prediction_tokens if len(token) > 0
]
reference_tokens = [
token for token in reference_tokens if len(token) > 0
]
if not answer_keyword:
answer_keyword = reference
if answer_keyword:
answer_keyword_tokens = list(
jieba.cut(answer_keyword, cut_all=False))
answer_keyword_tokens = [
normalize_zh_answer(token)
for token in answer_keyword_tokens
]
answer_keyword_tokens = [
token for token in answer_keyword_tokens
if len(token) > 0
]
common = Counter(prediction_tokens) & Counter(
answer_keyword_tokens)
filtered_common = {
key: value
for key, value in common.items()
if key not in ABANDON_WORDS_ZH
}
num_same = sum(filtered_common.values())
recall = 1.0 * num_same / len(answer_keyword_tokens)
if recall < 0.4:
break
task_score = max(task_score,
f1_score(prediction_tokens, reference_tokens))
break
score += task_score
score = score / len(predictions) * 100
return {'LVEval_f1': score}
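# Note on the "OPT" variant: the last entry of each reference list is treated
# as the answer keyword (several dataset loaders below append answer_keywords
# to the answers list). If the prediction's recall of the keyword tokens,
# after dropping the ABANDON_WORDS stop words, falls below 0.2 (en) or
# 0.4 (zh), the sample scores 0; otherwise the usual token-level F1 against
# the gold answer is used.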
@ICL_EVALUATORS.register_module()
class LVEvalOPTRougeEvaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
score = 0.0
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.0
for reference in reference_list:
if self.language == 'zh':
word_blacklist = ABANDON_WORDS_ZH
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
else:
word_blacklist = ABANDON_WORDS_EN
prediction_tokens = normalize_answer(prediction)
reference_tokens = normalize_answer(reference)
prediction_tokens = prediction_tokens.split()
reference_tokens = reference_tokens.split()
filtered_prediction_tokens = [
i for i in prediction_tokens if i not in word_blacklist
]
filtered_reference_tokens = [
i for i in reference_tokens if i not in word_blacklist
]
prediction = ' '.join(filtered_prediction_tokens)
reference = ' '.join(filtered_reference_tokens)
rouge = Rouge()
try:
cur_score = rouge.get_scores([prediction], [reference],
avg=True)['rouge-l']['f']
except Exception:
cur_score = 0.0
task_score = max(task_score, cur_score)
break
score += task_score
score = score / len(predictions) * 100
return {'LVEval_rouge': score}
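# Note: this evaluator applies the same stop-word filtering and then scores
# the prediction against the reference with ROUGE-L (F measure) from the
# `rouge` package, again averaging per-sample scores and scaling to 0-100,
# returned as {'LVEval_rouge': score}.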
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalcmrcDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
'confusing_facts': confusing_facts,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
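# Usage sketch (illustrative): the gen configs call these loaders with the HF
# repo and subset name, e.g.
#   LVEvalcmrcDataset.load(path='Infinigence/LVEval', name='cmrc_mixup_16k')
# The kwargs are forwarded to datasets.load_dataset, and the 'test' split is
# rebuilt to keep only the fields used by the prompt template and evaluator.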
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvaldureaderDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalfactrecallenDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
'confusing_facts': confusing_facts,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalfactrecallzhDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers,
'confusing_facts': confusing_facts,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvalhotpotwikiqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'confusing_facts': confusing_facts,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallicDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
confusing_facts = dataset[split]['confusing_facts'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'confusing_facts': confusing_facts,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallooglecrDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallooglemirDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LVEvallooglesdDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
answer_keywords = dataset[split]['answer_keywords'][i]
answers_with_ak = answers + [answer_keywords]
raw_data.append({
'input': question,
'context': context,
'answers': answers_with_ak,
'answer_keywords': answer_keywords,
})
dataset[split] = Dataset.from_list(raw_data)
return dataset