Unverified Commit 3bb3d330 authored by philipwangOvO, committed by GitHub

[Sync] Update LongEval (#443)

parent 2bb7beec
@@ -4,6 +4,7 @@ from typing import List

 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS
 from opencompass.utils.prompt import PromptList
+from opencompass.utils.text_postprocessors import general_postprocess


 @ICL_EVALUATORS.register_module()
@@ -107,3 +108,32 @@ class LEvalGPTEvaluator(BaseEvaluator):
         score = score / (num_samples - bad_case) * 100
         return {'score': score}
+
+
+@ICL_EVALUATORS.register_module()
+class LEvalEMEvaluator(BaseEvaluator):
+    """Exact match evaluator."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        predictions = [
+            general_postprocess(prediction) for prediction in predictions
+        ]
+        processed_answers = [general_postprocess(i) for i in references]
+
+        cnt = 0
+        for pred, ans, origin_ans in zip(predictions, processed_answers,
+                                         references):
+            if ans in pred or origin_ans in pred:
+                cnt += 1
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score}
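
For reference, the new LEvalEMEvaluator counts a prediction as correct when either the normalized reference or the raw reference string is contained in the normalized prediction. A minimal standalone sketch of that rule follows; `normalize` is a simplified stand-in for general_postprocess, whose exact behavior (lowercasing and punctuation stripping) is an assumption here.

import re
import string


def normalize(text: str) -> str:
    """Simplified stand-in for general_postprocess (assumed behavior)."""
    text = text.lower()
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    return re.sub(r'\s+', ' ', text).strip()


def em_score(predictions: list, references: list) -> float:
    """Containment-based exact match, mirroring LEvalEMEvaluator.score."""
    preds = [normalize(p) for p in predictions]
    answers = [normalize(r) for r in references]
    cnt = sum(1 for pred, ans, raw in zip(preds, answers, references)
              if ans in pred or raw in pred)
    return cnt / len(predictions) * 100


print(em_score(['The answer is Paris.'], ['Paris']))  # 100.0
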
@@ -10,17 +10,18 @@ from .longbench_gov_report import *  # noqa: F401, F403
 from .longbench_hotpot_qa import *  # noqa: F401, F403
 from .longbench_lcc import *  # noqa: F401, F403
 from .longbench_lsht import *  # noqa: F401, F403
+from .longbench_multi_news import *  # noqa: F401, F403
 from .longbench_multifieldqa_en import *  # noqa: F401, F403
 from .longbench_multifieldqa_zh import *  # noqa: F401, F403
 from .longbench_musique import *  # noqa: F401, F403
 from .longbench_narrative_qa import *  # noqa: F401, F403
-from .longbench_nq import *  # noqa: F401, F403
 from .longbench_passage_count import *  # noqa: F401, F403
 from .longbench_passage_retrieval_en import *  # noqa: F401, F403
 from .longbench_passage_retrieval_zh import *  # noqa: F401, F403
 from .longbench_qasper import *  # noqa: F401, F403
 from .longbench_qmsum import *  # noqa: F401, F403
 from .longbench_repobench import *  # noqa: F401, F403
+from .longbench_samsum import *  # noqa: F401, F403
 from .longbench_trec import *  # noqa: F401, F403
 from .longbench_trivia_qa import *  # noqa: F401, F403
 from .longbench_vcsum import *  # noqa: F401, F403
@@ -189,10 +189,10 @@ class LongBenchRougeEvaluator(BaseEvaluator):
                         list(jieba.cut(reference, cut_all=False)))

                 rouge = Rouge()
-                if prediction != '':
+                try:
                     cur_score = rouge.get_scores([prediction], [reference],
                                                  avg=True)['rouge-l']['f']
-                else:
+                except Exception:
                     cur_score = 0.
                 task_score = max(task_score, cur_score)
...
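
The empty-string check is replaced by a try/except because the rouge package can raise not only on empty hypotheses but also on strings that tokenize to nothing, which the old guard let through. A hedged sketch of the resulting pattern, assuming the PyPI `rouge` package that LongBench uses:

from rouge import Rouge


def safe_rouge_l_f(prediction: str, reference: str) -> float:
    """ROUGE-L F1 that falls back to 0 instead of raising."""
    try:
        return Rouge().get_scores([prediction], [reference],
                                  avg=True)['rouge-l']['f']
    except Exception:
        # rouge raises for hypotheses that tokenize to nothing,
        # e.g. '.', which passed the old `prediction != ''` check.
        return 0.


print(safe_rouge_l_f('', 'a reference'))   # 0.0, as before
print(safe_rouge_l_f('.', 'a reference'))  # 0.0, would have raised before
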
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LongBenchmulti_newsDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            context = dataset[split]['context'][i]
+            answers = dataset[split]['answers'][i]
+            raw_data.append({'context': context, 'answers': answers})
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
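
A hypothetical usage sketch of the new loader; the kwargs are forwarded straight to datasets.load_dataset, so the `path` and `name` values below are illustrative assumptions, not values taken from this commit.

# Illustrative only: path/name come from the LongBench configs in practice.
dataset = LongBenchmulti_newsDataset.load(path='THUDM/LongBench',
                                          name='multi_news')
sample = dataset['test'][0]
print(sample['answers'], sample['context'][:100])
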
@@ -6,7 +6,7 @@ from ..base import BaseDataset
 @LOAD_DATASET.register_module()
-class LongBenchnqDataset(BaseDataset):
+class LongBenchsamsumDataset(BaseDataset):

     @staticmethod
     def load(**kwargs):
...
@@ -42,6 +42,9 @@ class HuggingFace(BaseModel):
             without batch padding.
         pad_token_id (int): The id of the padding token. Defaults to None. Use
             (#vocab + pad_token_id) if get negative value.
+        mode (str, optional): The method of input truncation when input length
+            exceeds max_seq_len. 'mid' represents the part of input to
+            truncate. Defaults to 'none'.

     Note:
         About ``extract_pred_after_decode``: Commonly, we should extract the
@@ -62,7 +65,8 @@ class HuggingFace(BaseModel):
                  meta_template: Optional[Dict] = None,
                  extract_pred_after_decode: bool = False,
                  batch_padding: bool = False,
-                 pad_token_id: Optional[int] = None):
+                 pad_token_id: Optional[int] = None,
+                 mode: str = 'none'):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          tokenizer_only=tokenizer_only,
@@ -73,6 +77,8 @@ class HuggingFace(BaseModel):
             patch_hf_auto_model(hf_cache_dir)
         self.logger = get_logger()
         self.pad_token_id = pad_token_id
+        assert mode in ['none', 'mid']
+        self.mode = mode
         self._load_tokenizer(path=path,
                              tokenizer_path=tokenizer_path,
                              tokenizer_kwargs=tokenizer_kwargs)
@@ -228,6 +234,18 @@ class HuggingFace(BaseModel):
         if self.extract_pred_after_decode:
             prompt_lens = [len(input_) for input_ in inputs]

+        if self.mode == 'mid':
+            input_ids = self.tokenizer(inputs, truncation=False)['input_ids']
+            input_ids = torch.tensor(input_ids, device=self.model.device)
+            if len(input_ids[0]) > self.max_seq_len - max_out_len:
+                half = int((self.max_seq_len - max_out_len) / 2)
+                inputs = [
+                    self.tokenizer.decode(input_ids[0][:half],
+                                          skip_special_tokens=True) +
+                    self.tokenizer.decode(input_ids[0][-half:],
+                                          skip_special_tokens=True)
+                ]
+
         input_ids = self.tokenizer(inputs,
                                    truncation=True,
                                    max_length=self.max_seq_len -
...
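
The new 'mid' mode keeps the head and tail of an over-long prompt and drops the middle, on the assumption that instructions and questions usually sit near the ends of long-context inputs. A standalone sketch of the same truncation with a Hugging Face tokenizer (the gpt2 tokenizer is an illustrative choice, not one mandated by this commit):

from transformers import AutoTokenizer


def truncate_mid(prompt: str, tokenizer, max_seq_len: int,
                 max_out_len: int) -> str:
    """Keep the first and last half of the token budget, drop the middle."""
    ids = tokenizer(prompt, truncation=False)['input_ids']
    budget = max_seq_len - max_out_len
    if len(ids) <= budget:
        return prompt
    half = budget // 2
    return (tokenizer.decode(ids[:half], skip_special_tokens=True) +
            tokenizer.decode(ids[-half:], skip_special_tokens=True))


tok = AutoTokenizer.from_pretrained('gpt2')  # illustrative model choice
short = truncate_mid('word ' * 5000, tok, max_seq_len=1024, max_out_len=100)
print(len(tok(short)['input_ids']))  # roughly the 924-token budget

Note that the diff applies the truncation to input_ids[0] only, so in effect it assumes generation batches of size one when mode='mid'.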