Unverified Commit 3bb3d330 authored by philipwangOvO, committed by GitHub

[Sync] Update LongEval (#443)

parent 2bb7beec
@@ -4,6 +4,7 @@ from typing import List

 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS
 from opencompass.utils.prompt import PromptList
+from opencompass.utils.text_postprocessors import general_postprocess


 @ICL_EVALUATORS.register_module()
@@ -107,3 +108,32 @@ class LEvalGPTEvaluator(BaseEvaluator):
         score = score / (num_samples - bad_case) * 100
         return {'score': score}
+
+
+@ICL_EVALUATORS.register_module()
+class LEvalEMEvaluator(BaseEvaluator):
+    """Exact match evaluator."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        predictions = [
+            general_postprocess(prediction) for prediction in predictions
+        ]
+        processed_answers = [general_postprocess(i) for i in references]
+
+        cnt = 0
+        for pred, ans, origin_ans in zip(predictions, processed_answers,
+                                         references):
+            if ans in pred or origin_ans in pred:
+                cnt += 1
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score}
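
For reference, the new LEvalEMEvaluator counts a prediction as correct when either the normalized reference or the raw reference string is contained in the normalized prediction. A minimal standalone sketch of that rule follows; `normalize` is a simplified stand-in for general_postprocess, whose exact behavior (lowercasing and punctuation stripping) is an assumption here.

import re
import string


def normalize(text: str) -> str:
    """Simplified stand-in for general_postprocess (assumed behavior)."""
    text = text.lower()
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    return re.sub(r'\s+', ' ', text).strip()


def em_score(predictions: list, references: list) -> float:
    """Containment-based exact match, mirroring LEvalEMEvaluator.score."""
    preds = [normalize(p) for p in predictions]
    answers = [normalize(r) for r in references]
    cnt = sum(1 for pred, ans, raw in zip(preds, answers, references)
              if ans in pred or raw in pred)
    return cnt / len(predictions) * 100


print(em_score(['The answer is Paris.'], ['Paris']))  # 100.0
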
@@ -10,17 +10,18 @@ from .longbench_gov_report import *  # noqa: F401, F403
 from .longbench_hotpot_qa import *  # noqa: F401, F403
 from .longbench_lcc import *  # noqa: F401, F403
 from .longbench_lsht import *  # noqa: F401, F403
+from .longbench_multi_news import *  # noqa: F401, F403
 from .longbench_multifieldqa_en import *  # noqa: F401, F403
 from .longbench_multifieldqa_zh import *  # noqa: F401, F403
 from .longbench_musique import *  # noqa: F401, F403
 from .longbench_narrative_qa import *  # noqa: F401, F403
-from .longbench_nq import *  # noqa: F401, F403
 from .longbench_passage_count import *  # noqa: F401, F403
 from .longbench_passage_retrieval_en import *  # noqa: F401, F403
 from .longbench_passage_retrieval_zh import *  # noqa: F401, F403
 from .longbench_qasper import *  # noqa: F401, F403
 from .longbench_qmsum import *  # noqa: F401, F403
 from .longbench_repobench import *  # noqa: F401, F403
+from .longbench_samsum import *  # noqa: F401, F403
 from .longbench_trec import *  # noqa: F401, F403
 from .longbench_trivia_qa import *  # noqa: F401, F403
 from .longbench_vcsum import *  # noqa: F401, F403
@@ -189,10 +189,10 @@ class LongBenchRougeEvaluator(BaseEvaluator):
                         list(jieba.cut(reference, cut_all=False)))

                 rouge = Rouge()
-                if prediction != '':
+                try:
                     cur_score = rouge.get_scores([prediction], [reference],
                                                  avg=True)['rouge-l']['f']
-                else:
+                except Exception:
                     cur_score = 0.
                 task_score = max(task_score, cur_score)
...
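
The empty-string check is replaced by a try/except because the rouge package can raise not only on empty hypotheses but also on strings that tokenize to nothing, which the old guard let through. A hedged sketch of the resulting pattern, assuming the PyPI `rouge` package that LongBench uses:

from rouge import Rouge


def safe_rouge_l_f(prediction: str, reference: str) -> float:
    """ROUGE-L F1 that falls back to 0 instead of raising."""
    try:
        return Rouge().get_scores([prediction], [reference],
                                  avg=True)['rouge-l']['f']
    except Exception:
        # rouge raises for hypotheses that tokenize to nothing,
        # e.g. '.', which passed the old `prediction != ''` check.
        return 0.


print(safe_rouge_l_f('', 'a reference'))   # 0.0, as before
print(safe_rouge_l_f('.', 'a reference'))  # 0.0, would have raised before
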
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from ..base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LongBenchmulti_newsDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            context = dataset[split]['context'][i]
+            answers = dataset[split]['answers'][i]
+            raw_data.append({'context': context, 'answers': answers})
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
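
A hypothetical usage sketch of the new loader; the kwargs are forwarded straight to datasets.load_dataset, so the `path` and `name` values below are illustrative assumptions, not values taken from this commit.

# Illustrative only: path/name come from the LongBench configs in practice.
dataset = LongBenchmulti_newsDataset.load(path='THUDM/LongBench',
                                          name='multi_news')
sample = dataset['test'][0]
print(sample['answers'], sample['context'][:100])
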
@@ -6,7 +6,7 @@ from ..base import BaseDataset
 @LOAD_DATASET.register_module()
-class LongBenchnqDataset(BaseDataset):
+class LongBenchsamsumDataset(BaseDataset):

     @staticmethod
     def load(**kwargs):
...
@@ -42,6 +42,9 @@ class HuggingFace(BaseModel):
             without batch padding.
         pad_token_id (int): The id of the padding token. Defaults to None. Use
             (#vocab + pad_token_id) if get negative value.
+        mode (str, optional): The method of input truncation when input length
+            exceeds max_seq_len. 'mid' represents the part of input to
+            truncate. Defaults to 'none'.

     Note:
         About ``extract_pred_after_decode``: Commonly, we should extract the
@@ -62,7 +65,8 @@ class HuggingFace(BaseModel):
                  meta_template: Optional[Dict] = None,
                  extract_pred_after_decode: bool = False,
                  batch_padding: bool = False,
-                 pad_token_id: Optional[int] = None):
+                 pad_token_id: Optional[int] = None,
+                 mode: str = 'none'):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          tokenizer_only=tokenizer_only,
@@ -73,6 +77,8 @@ class HuggingFace(BaseModel):
             patch_hf_auto_model(hf_cache_dir)
         self.logger = get_logger()
         self.pad_token_id = pad_token_id
+        assert mode in ['none', 'mid']
+        self.mode = mode
         self._load_tokenizer(path=path,
                              tokenizer_path=tokenizer_path,
                              tokenizer_kwargs=tokenizer_kwargs)
@@ -228,6 +234,18 @@ class HuggingFace(BaseModel):
         if self.extract_pred_after_decode:
             prompt_lens = [len(input_) for input_ in inputs]

+        if self.mode == 'mid':
+            input_ids = self.tokenizer(inputs, truncation=False)['input_ids']
+            input_ids = torch.tensor(input_ids, device=self.model.device)
+            if len(input_ids[0]) > self.max_seq_len - max_out_len:
+                half = int((self.max_seq_len - max_out_len) / 2)
+                inputs = [
+                    self.tokenizer.decode(input_ids[0][:half],
+                                          skip_special_tokens=True) +
+                    self.tokenizer.decode(input_ids[0][-half:],
+                                          skip_special_tokens=True)
+                ]
+
         input_ids = self.tokenizer(inputs,
                                    truncation=True,
                                    max_length=self.max_seq_len -
...
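
The new 'mid' mode keeps the head and tail of an over-long prompt and drops the middle, on the assumption that instructions and questions usually sit near the ends of long-context inputs. A standalone sketch of the same truncation with a Hugging Face tokenizer (the gpt2 tokenizer is an illustrative choice, not one mandated by this commit):

from transformers import AutoTokenizer


def truncate_mid(prompt: str, tokenizer, max_seq_len: int,
                 max_out_len: int) -> str:
    """Keep the first and last half of the token budget, drop the middle."""
    ids = tokenizer(prompt, truncation=False)['input_ids']
    budget = max_seq_len - max_out_len
    if len(ids) <= budget:
        return prompt
    half = budget // 2
    return (tokenizer.decode(ids[:half], skip_special_tokens=True) +
            tokenizer.decode(ids[-half:], skip_special_tokens=True))


tok = AutoTokenizer.from_pretrained('gpt2')  # illustrative model choice
short = truncate_mid('word ' * 5000, tok, max_seq_len=1024, max_out_len=100)
print(len(tok(short)['input_ids']))  # roughly the 924-token budget

Note that the diff applies the truncation to input_ids[0] only, so in effect it assumes generation batches of size one when mode='mid'.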