Unverified Commit 655a807f authored by philipwangOvO, committed by GitHub

[Dataset] LongBench (#236)


Co-authored-by: wangchonghua <wangchonghua@pjlab.org.cn>
parent c6a34949
summarizer = dict(
dataset_abbrs = [
'--------- LongBench Single-Document QA ---------', # category
"LongBench_narrativeqa",
'LongBench_qasper',
'LongBench_multifieldqa_en',
"LongBench_multifieldqa_zh",
'--------- LongBench Multi-Document QA ---------', # category
'LongBench_hotpotqa',
'LongBench_2wikimqa',
'LongBench_musique',
'LongBench_dureader',
'--------- LongBench Summarization ---------', # category
'LongBench_gov_report',
'LongBench_qmsum',
'LongBench_vcsum',
'--------- LongBench Few-shot Learning ---------', # category
'LongBench_trec',
'LongBench_nq',
'LongBench_triviaqa',
'LongBench_lsht',
'--------- LongBench Code Completion ---------', # category
'LongBench_lcc',
'LongBench_repobench-p',
'--------- LongBench Synthetic Tasks ---------', # category
'LongBench_passage_retrieval_en',
'LongBench_passage_count',
'LongBench_passage_retrieval_zh',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
prompt_db=dict(
database_path='configs/datasets/log.json',
config_dir='configs/datasets',
blacklist='.promptignore'),
)
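The `summary_groups` entry collects every local variable whose name ends in `_summary_groups`, so any group definition imported above the summarizer dict is picked up automatically. A minimal sketch of such a variable, assuming the usual OpenCompass format of `name`/`subsets` dicts (the group name and subset list below are illustrative, not part of this commit):

# Hypothetical group definition that the locals() scan above would collect.
longbench_qa_summary_groups = [
    dict(name='LongBench_single_doc_qa',
         subsets=['LongBench_narrativeqa', 'LongBench_qasper',
                  'LongBench_multifieldqa_en', 'LongBench_multifieldqa_zh']),
]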
@@ -54,6 +54,7 @@ from .LEval_scientific_qa import * # noqa: F401, F403
from .LEval_topic_retrieval import * # noqa: F401, F403
from .LEval_tpo import * # noqa: F401, F403
from .LEval_tvshow_summ import * # noqa: F401, F403
from .longbench import * # noqa: F401, F403
from .math import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
...
from .evaluators import LongBenchClassificationEvaluator # noqa: F401, F403
from .evaluators import LongBenchCodeSimEvaluator # noqa: F401, F403
from .evaluators import LongBenchCountEvaluator # noqa: F401, F403
from .evaluators import LongBenchF1Evaluator # noqa: F401, F403
from .evaluators import LongBenchRetrievalEvaluator # noqa: F401, F403
from .evaluators import LongBenchRougeEvaluator # noqa: F401, F403
from .longbench_2wikim_qa import * # noqa: F401, F403
from .longbench_dureader import * # noqa: F401, F403
from .longbench_gov_report import * # noqa: F401, F403
from .longbench_hotpot_qa import * # noqa: F401, F403
from .longbench_lcc import * # noqa: F401, F403
from .longbench_lsht import * # noqa: F401, F403
from .longbench_multifieldqa_en import * # noqa: F401, F403
from .longbench_multifieldqa_zh import * # noqa: F401, F403
from .longbench_musique import * # noqa: F401, F403
from .longbench_narrative_qa import * # noqa: F401, F403
from .longbench_nq import * # noqa: F401, F403
from .longbench_passage_count import * # noqa: F401, F403
from .longbench_passage_retrieval_en import * # noqa: F401, F403
from .longbench_passage_retrieval_zh import * # noqa: F401, F403
from .longbench_qasper import * # noqa: F401, F403
from .longbench_qmsum import * # noqa: F401, F403
from .longbench_repobench import * # noqa: F401, F403
from .longbench_trec import * # noqa: F401, F403
from .longbench_trivia_qa import * # noqa: F401, F403
from .longbench_vcsum import * # noqa: F401, F403
import difflib
import re
import string
from collections import Counter
from typing import List
import jieba
from fuzzywuzzy import fuzz
from rouge import Rouge
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r'\b(a|an|the)\b', ' ', text)
def white_space_fix(text):
return ' '.join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return ''.join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def normalize_zh_answer(s):
"""Lower text and remove punctuation, extra whitespace."""
def white_space_fix(text):
return ''.join(text.split())
def remove_punc(text):
        cn_punctuation = ('！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀'
                          '｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟'
                          '〰〾〿–—‘’‛“”„‟…‧﹏.')
all_punctuation = set(string.punctuation + cn_punctuation)
return ''.join(ch for ch in text if ch not in all_punctuation)
def lower(text):
return text.lower()
return white_space_fix(remove_punc(lower(s)))
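# Hand-worked examples of the two normalizers above (toy strings, written out
# for illustration only):
#   normalize_answer('The Quick, Brown Fox!')  ->  'quick brown fox'
#   normalize_zh_answer('你好，世界！')  ->  '你好世界'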
@ICL_EVALUATORS.register_module()
class LongBenchF1Evaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
def f1_score(prediction, reference, **kwargs):
common = Counter(prediction) & Counter(reference)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction)
recall = 1.0 * num_same / len(reference)
f1 = (2 * precision * recall) / (precision + recall)
return f1
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.
for reference in reference_list:
if self.language == 'en':
normalized_prediction = normalize_answer(prediction)
normalized_reference = normalize_answer(reference)
prediction_tokens = normalized_prediction.split()
reference_tokens = normalized_reference.split()
else:
prediction_tokens = list(
jieba.cut(prediction, cut_all=False))
reference_tokens = list(jieba.cut(reference,
cut_all=False))
prediction_tokens = [
normalize_zh_answer(token)
for token in prediction_tokens
]
reference_tokens = [
normalize_zh_answer(token)
for token in reference_tokens
]
prediction_tokens = [
token for token in prediction_tokens if len(token) > 0
]
reference_tokens = [
token for token in reference_tokens if len(token) > 0
]
task_score = max(task_score,
f1_score(prediction_tokens, reference_tokens))
score += task_score
score = score / len(predictions) * 100
return {'score': score}
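# Usage sketch for the F1 evaluator above (toy strings, not benchmark data);
# each reference entry is a list of acceptable answers for one prediction:
#
#     evaluator = LongBenchF1Evaluator(language='en')
#     evaluator.score(predictions=['Paris is the capital of France'],
#                     references=[['Paris']])
#
# After normalization the prediction has five tokens and shares one with the
# reference, so precision is 1/5, recall is 1, F1 is 1/3, and the returned
# score is roughly 33.3.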
@ICL_EVALUATORS.register_module()
class LongBenchCountEvaluator(BaseEvaluator):
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
for reference in reference_list:
numbers = re.findall(r'\d+', prediction)
right_num = 0
for number in numbers:
if str(number) == str(reference):
right_num += 1
score += 0.0 if len(numbers) == 0 else float(right_num /
len(numbers))
score = score / len(predictions) * 100
return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchRetrievalEvaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
for reference in reference_list:
if self.language == 'en':
pattern = r'Paragraph (\d+)'
else:
pattern = r'段落(\d+)'
matches = re.findall(pattern, reference)
reference_id = matches[0]
numbers = re.findall(r'\d+', prediction)
right_num = 0
for number in numbers:
if str(number) == str(reference_id):
right_num += 1
score += 0.0 if len(numbers) == 0 else float(right_num /
len(numbers))
score = score / len(predictions) * 100
return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchRougeEvaluator(BaseEvaluator):
def __init__(self, language: str = 'en') -> None:
super().__init__()
assert language in ['en', 'zh']
self.language = language
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.
for reference in reference_list:
if self.language == 'zh':
prediction = ' '.join(
list(jieba.cut(prediction, cut_all=False)))
reference = ' '.join(
list(jieba.cut(reference, cut_all=False)))
rouge = Rouge()
if prediction != '':
cur_score = rouge.get_scores([prediction], [reference],
avg=True)['rouge-l']['f']
else:
cur_score = 0.
task_score = max(task_score, cur_score)
score += task_score
score = score / len(predictions) * 100
return {'score': score}
@ICL_EVALUATORS.register_module()
class LongBenchCodeSimEvaluator(BaseEvaluator):
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]
task_score = 0.
for reference in reference_list:
all_lines = prediction.lstrip('\n').split('\n')
prediction = ''
for line in all_lines:
if ('`' not in line) and ('#'
not in line) and ('//'
not in line):
prediction = line
break
task_score = max(task_score,
(fuzz.ratio(prediction, reference) / 100))
score += task_score
score = score / len(predictions) * 100
return {'score': score}
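# Usage sketch: the evaluator keeps the first generated line that contains no
# back-tick, '#' or '//' and fuzzy-matches it against the reference:
#
#     evaluator = LongBenchCodeSimEvaluator()
#     evaluator.score(predictions=['```python\nreturn x + 1\n```'],
#                     references=[['return x + 1']])
#
# The fenced lines are skipped, 'return x + 1' matches the reference exactly,
# and the returned score is 100.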
@ICL_EVALUATORS.register_module()
class LongBenchClassificationEvaluator(BaseEvaluator):
def score(self, predictions: List, references: List) -> dict:
score = 0.
for i in range(len(predictions)):
prediction = predictions[i]
reference_list = references[i]['answers']
for reference in reference_list:
em_match_list = []
all_classes = references[i]['all_classes']
for class_name in all_classes:
if class_name in prediction:
em_match_list.append(class_name)
for match_term in em_match_list:
if match_term in reference and match_term != reference:
em_match_list.remove(match_term)
                # Fall back to fuzzy matching against the label set only when
                # no class name appears verbatim in the prediction.
                if len(em_match_list) != 0:
if reference in em_match_list:
score += (1.0 / len(em_match_list))
else:
best_match = None
highest_similarity = 0
for names in all_classes:
similarity = difflib.SequenceMatcher(
None, names, prediction).ratio()
if similarity > highest_similarity:
highest_similarity = similarity
best_match = names
score += float(best_match == reference)
score = score / len(predictions) * 100
return {'score': score}
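# Usage sketch: unlike the other evaluators, each reference here is a dict
# carrying both the gold answers and the candidate label set (toy labels):
#
#     evaluator = LongBenchClassificationEvaluator()
#     evaluator.score(
#         predictions=['This ticket is about billing.'],
#         references=[{'answers': ['billing'],
#                      'all_classes': ['billing', 'shipping', 'refund']}])
#
# 'billing' is the only class name found verbatim in the prediction, so the
# sample scores 1.0 and the overall score is 100.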
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBench2wikimqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
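Each of these loaders forwards its keyword arguments straight to `datasets.load_dataset` and rebuilds the test split with only the fields the prompts need. A hedged usage sketch, assuming the public `THUDM/LongBench` release on the Hugging Face Hub (the path and subset name are assumptions, not part of this commit):

from opencompass.datasets import LongBench2wikimqaDataset

# Assumed path/name of the Hugging Face LongBench release; point these at a
# local copy if the Hub is unavailable.
ds = LongBench2wikimqaDataset.load(path='THUDM/LongBench', name='2wikimqa')
print(ds['test'][0].keys())  # expected fields: input, context, answers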
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchdureaderDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchgov_reportDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({'context': context, 'answers': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchhotpotqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchlccDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({'context': context, 'answers': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchlshtDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
all_classes = dataset[split]['all_classes'][i]
raw_data.append({
'input': question,
'context': context,
'all_labels': {
'answers': answers,
'all_classes': all_classes
}
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
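The classification-style loaders keep the gold answers and the full label inventory together under `all_labels`, because LongBenchClassificationEvaluator reads both `answers` and `all_classes` from each reference. A sketch of the row shape this produces (field values are placeholders):

# One rebuilt test row, roughly:
# {
#     'input': '...',       # the query text
#     'context': '...',     # the long document
#     'all_labels': {
#         'answers': ['<gold label>'],
#         'all_classes': ['<label 1>', '<label 2>', ...]  # full label set
#     }
# }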
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmultifieldqa_enDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmultifieldqa_zhDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchmusiqueDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchnarrativeqaDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchnqDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_countDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({'context': context, 'answers': answers})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_retrieval_enDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchpassage_retrieval_zhDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchqasperDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset
from datasets import Dataset, load_dataset
from opencompass.registry import LOAD_DATASET
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LongBenchqmsumDataset(BaseDataset):
@staticmethod
def load(**kwargs):
dataset = load_dataset(**kwargs)
split = 'test'
raw_data = []
for i in range(len(dataset[split])):
question = dataset[split]['input'][i]
context = dataset[split]['context'][i]
answers = dataset[split]['answers'][i]
raw_data.append({
'input': question,
'context': context,
'answers': answers
})
dataset[split] = Dataset.from_list(raw_data)
return dataset