Unverified commit dbb20b82, authored by Fengzhe Zhou and committed by GitHub

[Sync] update (#517)

parent 6f07af30
@@ -13,9 +13,9 @@ class CMBDataset(BaseDataset):
     @staticmethod
     def load(path: str):
-        with open(osp.join(path, 'test.json'), 'r') as f:
+        with open(osp.join(path, 'test.json'), 'r', encoding='utf-8') as f:
             test_data = json.load(f)
-        with open(osp.join(path, 'val.json'), 'r') as f:
+        with open(osp.join(path, 'val.json'), 'r', encoding='utf-8') as f:
             val_data = json.load(f)
         for da in test_data:
...
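A note on the recurring change in this commit: the dataset loaders now pass encoding='utf-8' to every open() call. Without it, Python falls back to locale.getpreferredencoding(), which is platform dependent, so the Chinese JSON/JSONL files can raise UnicodeDecodeError or silently mis-decode on some Windows setups. A minimal sketch outside the patch (the temp-file path and sample record are made up for illustration):

    # Minimal sketch: why pinning the encoding matters for these loaders.
    import json
    import locale
    import tempfile

    sample = {'question': '以下哪项是正确的？', 'answer': 'A'}  # illustrative record
    with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False,
                                     encoding='utf-8') as f:
        json.dump(sample, f, ensure_ascii=False)
        path = f.name

    print('platform default encoding:', locale.getpreferredencoding(False))

    # The patched pattern: read the dataset with an explicit encoding.
    with open(path, 'r', encoding='utf-8') as f:
        print(json.load(f))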
@@ -13,7 +13,7 @@ class cmnliDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 if line['label'] == '-':
...
@@ -12,7 +12,7 @@ class CMRCDataset(BaseDataset):
     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         # Convert the raw data into the required format
         rows = []
...
@@ -13,7 +13,7 @@ class COPADataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         dataset = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 line['label'] = 'AB'[line['label']]
...
@@ -31,7 +31,7 @@ class CslDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {
...
@@ -12,7 +12,7 @@ class DRCDDataset(BaseDataset):
     @staticmethod
     def load(path: str):
-        with open(path) as f:
+        with open(path, 'r', encoding='utf-8') as f:
             data = json.load(f)
         # Convert the raw data into the required format
         rows = []
...
@@ -13,7 +13,7 @@ class eprstmtDataset_V2(BaseDataset):
     @staticmethod
     def load(path):
         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {
...
+from opencompass.openicl import BaseEvaluator
 from opencompass.registry import TEXT_POSTPROCESSORS

@@ -26,3 +27,25 @@ def gsm8k_postprocess(text: str) -> str:
         if ret[i].isdigit():
             ret1 += ret[i]
     return ret1
+
+
+class Gsm8kEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        correct = 0
+        count = 0
+        details = []
+        for i, j in zip(predictions, references):
+            detail = {'pred': i, 'answers': j, 'correct': False}
+            count += 1
+            if i == j:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
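The new Gsm8kEvaluator scores by exact string match between post-processed predictions and references, and now returns per-sample details alongside the accuracy. A hypothetical usage sketch (the import path and values are assumptions, not taken from the diff):

    from opencompass.datasets.gsm8k import Gsm8kEvaluator  # assumed module path

    evaluator = Gsm8kEvaluator()
    result = evaluator.score(predictions=['18', '7', '42'],
                             references=['18', '7', '41'])
    print(round(result['accuracy'], 2))  # 66.67
    print(result['details'][2])
    # {'pred': '42', 'answers': '41', 'correct': False}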
@@ -49,7 +49,7 @@ class hellaswagDataset_V3(BaseDataset):
     @staticmethod
     def load(path):
         dataset = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 data = json.loads(line)
                 dataset.append({
...
@@ -148,11 +148,15 @@ class MATHEvaluator(BaseEvaluator):
             }
         correct = 0
         count = 0
+        details = []
         for i, j in zip(predictions, references):
+            detail = {'pred': i, 'answer': j, 'correct': False}
             count += 1
             if self.is_equiv(i, j):
                 correct += 1
-        result = {'accuracy': 100 * correct / count}
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
         return result

     def _fix_fracs(self, string):
...
@@ -52,9 +52,14 @@ class NQEvaluator(BaseEvaluator):
         processed_answers = [[general_postprocess(j).lower() for j in i]
                              for i in references]

+        details = []
         cnt = 0
         for pred, cand_ans in zip(processed_predictions, processed_answers):
+            detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
             cnt += int(any([cand == pred for cand in cand_ans]))
+            if int(any([cand == pred for cand in cand_ans])):
+                detail['correct'] = True
+            details.append(detail)
         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -67,7 +67,7 @@ class TNewsDataset_V2(BaseDataset):
         }

         data = []
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
                 item = {
...
@@ -51,9 +51,14 @@ class TriviaQAEvaluator(BaseEvaluator):
         processed_answers = [[general_postprocess(j).lower() for j in i]
                              for i in references]

+        details = []
         cnt = 0
         for pred, cand_ans in zip(processed_predictions, processed_answers):
+            detail = {'pred': pred, 'answer': cand_ans, 'correct': False}
             cnt += int(any([cand == pred for cand in cand_ans]))
+            if int(any([cand == pred for cand in cand_ans])):
+                detail['correct'] = True
+            details.append(detail)
         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -82,6 +82,20 @@ def strategyqa_pred_postprocess(text: str) -> str:
     return ''


+def flores_postprocess(text: str) -> str:
+    text = text.strip().split('\n')[-1].strip()
+    return text
+
+
+def flores_postprocess_chinese(text: str) -> str:
+    text = text.strip().split('\n')[-1].strip()
+    import jieba
+    truncated_text = text.strip().split('\n')[0]
+    cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip()
+    cleaned_text = ' '.join(jieba.cut(cleaned_text))
+    return cleaned_text
+
+
 def record_postprocess(text: str) -> str:
     match = re.search(r'(?<=refers to )[^.]+', text)
...
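flores_postprocess_chinese keeps only the first line of the model output, collapses whitespace, and then re-tokenizes the Chinese text with jieba so that word-level metrics such as BLEU see space-separated tokens rather than one unsegmented string. A small sketch; the exact split depends on the installed jieba dictionary, so the output shown is only indicative:

    import re

    import jieba

    text = '今天的天气非常好，适合出门散步。'
    truncated = text.strip().split('\n')[0]
    cleaned = re.sub(r'\s+', ' ', truncated).strip()
    print(' '.join(jieba.cut(cleaned)))
    # e.g. 今天 的 天气 非常 好 ， 适合 出门 散步 。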
@@ -24,11 +24,18 @@ class EMEvaluator(BaseEvaluator):
                              for i in references]

         cnt = 0
+        details = []
         for pred, ans, origin_ans in zip(predictions, processed_answers,
                                          references):
+            answers = list(set(ans + origin_ans))
+            detail = {'pred': pred, 'answer': answers}
             if pred in ans or pred in origin_ans:
                 cnt += 1
+                detail['correct'] = True
+            else:
+                detail['correct'] = False
+            details.append(detail)

         score = cnt / len(predictions) * 100

-        return {'score': score}
+        return {'score': score, 'details': details}
@@ -51,8 +51,7 @@ class BaseInferencer:
         self.output_json_filepath = output_json_filepath
         self.output_json_filename = output_json_filename
         self.is_main_process = is_main_process()
-        if not os.path.exists(self.output_json_filepath):
-            os.makedirs(self.output_json_filepath)
+        os.makedirs(self.output_json_filepath, exist_ok=True)

     def inference(self,
                   retriever: BaseRetriever,
...
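The check-then-create pattern removed above is racy when several inferencer workers start at once: another process can create the directory between the os.path.exists check and os.makedirs, which then raises FileExistsError. With exist_ok=True the call is idempotent. A one-liner sketch with an illustrative path:

    import os

    out_dir = 'outputs/default/predictions'  # illustrative path, not from the diff
    os.makedirs(out_dir, exist_ok=True)      # safe even if another worker created it first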
@@ -94,6 +94,7 @@ class PPLInferencer(BaseInferencer):
             index = 0
             prompt_list = []
             sub_ppl_list = []
+            token_num_list = []
             normalizing_prompt_list = []
             context_length_list = []

@@ -144,6 +145,7 @@ class PPLInferencer(BaseInferencer):
                             mode='ppl'))
                     normalizing_prompt_list.append(normalizing_prompt)
                 prompt_list.append(prompt)
+                token_num_list.append(prompt_token_num)

             if normalizing_str is not None:
                 normalizing_str_len = self.model.get_token_len_from_template(

@@ -186,6 +188,10 @@ class PPLInferencer(BaseInferencer):
                     ice_str = self.model.parse_template(ice[idx], mode='ppl')
                     output_handler.save_prompt_and_ppl(
                         label, prompt.replace(ice_str, ''), prompt, res, index)
+                    output_handler.results_dict[str(
+                        index)][f'label: {str(label)}'][
+                            'BPB'] = res * token_num_list[idx] / len(
+                                prompt.replace(ice_str, '').encode())
                     index = index + 1
                 ppl.append(sub_ppl_list)
...
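The added 'BPB' entry rescales the perplexity result by prompt length in bytes. Assuming res is the mean per-token loss returned by the model's PPL path and token_num_list[idx] is the prompt's token count (both assumptions about the surrounding inferencer, not stated in the diff), the stored value is the total loss divided by the UTF-8 byte length of the prompt with the in-context examples stripped, which makes scores less sensitive to tokenizer granularity. A standalone sketch of the arithmetic:

    def loss_per_byte(res: float, token_num: int, prompt: str) -> float:
        # total loss over the prompt, normalized by its UTF-8 byte length
        return res * token_num / len(prompt.encode('utf-8'))

    print(loss_per_byte(2.1, 120, 'example prompt ' * 30))  # made-up numbers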
 from abc import abstractmethod
 from copy import deepcopy
-from typing import Dict, List
+from typing import Dict, List, Optional

 from mmengine.config import ConfigDict

@@ -13,16 +13,24 @@ class BasePartitioner:

     Args:
         out_dir (str): The output directory of tasks.
-        keep_keys (List[str]): The keys to be kept from the experiment config
-            to the task config.
+        keep_keys (Optional[List[str]], optional): The keys to be kept from the
+            experiment config to the task config. Defaults to None. If None,
+            the following keys will be kept:
+            - eval.runner.task.judge_cfg
+            - eval.runner.task.dump_details
     """

-    def __init__(self,
-                 out_dir: str,
-                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+    def __init__(self, out_dir: str, keep_keys: Optional[List[str]] = None):
         self.logger = get_logger()
         self.out_dir = out_dir
-        self.keep_keys = keep_keys
+        if keep_keys is None:
+            self.keep_keys = [
+                'eval.runner.task.judge_cfg',
+                'eval.runner.task.dump_details',
+            ]
+        else:
+            self.keep_keys = keep_keys

     def __call__(self, cfg: ConfigDict) -> List[Dict]:
         """Generate tasks from config. Each task is defined as a

@@ -63,7 +71,8 @@ class BasePartitioner:
                     tgt_ptr = tgt_ptr[key]
                 tgt_ptr[key_chain[-1]] = ori_ptr[key_chain[-1]]
             except Exception:
-                self.logger.warning(f'Key {k} not found in config, ignored.')
+                self.logger.debug(f'Key {k} not found in config, ignored.')
+        self.logger.debug(f'Additional config: {add_cfg}')

         tasks = self.partition(models,
                                datasets,
...
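Switching keep_keys from a list-literal default to None, with the default filled in inside __init__, is the usual way to grow a default in one place (here it gains eval.runner.task.dump_details) and to sidestep Python's shared-mutable-default pitfall. A generic illustration of that pitfall, not opencompass code:

    def buggy(keys=['eval.runner.task.judge_cfg']):
        keys.append('extra')          # mutates the single shared default list
        return keys

    print(buggy())  # ['eval.runner.task.judge_cfg', 'extra']
    print(buggy())  # ['eval.runner.task.judge_cfg', 'extra', 'extra']

    def fixed(keys=None):
        keys = ['eval.runner.task.judge_cfg'] if keys is None else keys
        keys.append('extra')
        return keys

    print(fixed())  # ['eval.runner.task.judge_cfg', 'extra']
    print(fixed())  # ['eval.runner.task.judge_cfg', 'extra']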
 import os.path as osp
-from typing import Dict, List
+from typing import Dict, List, Optional

 from mmengine.config import Config, ConfigDict

@@ -11,15 +11,23 @@ from .base import BasePartitioner

 @PARTITIONERS.register_module()
 class NaivePartitioner(BasePartitioner):
-    """Naive task partitioner. This partitioner will generate a task for each
-    model-dataset pair.
+    """Naive task partitioner. This partitioner will generate a task for each n
+    model-dataset pairs.

     Args:
         out_dir (str): The output directory of tasks.
+        n (int): The number of model-dataset pairs in each task.
         keep_keys (List[str]): The keys to be kept from the experiment config
             to the task config.
     """

+    def __init__(self,
+                 out_dir: str,
+                 n: int = 1,
+                 keep_keys: Optional[List[str]] = None):
+        super().__init__(out_dir=out_dir, keep_keys=keep_keys)
+        self.n = n
+
     def partition(self,
                   models: List[ConfigDict],
                   datasets: List[ConfigDict],

@@ -53,13 +61,17 @@ class NaivePartitioner(BasePartitioner):

         tasks = []
         for model in models:
+            chunks = []
             for dataset in datasets:
                 filename = get_infer_output_path(model, dataset, out_dir)
                 if osp.exists(filename):
                     continue
+                chunks.append(dataset)
+
+            for i in range(0, len(chunks), self.n):
                 task = Config({
                     'models': [model],
-                    'datasets': [[dataset]],
+                    'datasets': [chunks[i:i + self.n]],
                     'work_dir': work_dir,
                     **add_cfg
                 })
...
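With the new n parameter, the partitioner first collects every dataset whose prediction file does not exist yet into chunks, then emits one task per group of up to n datasets. A sketch of the grouping with placeholder names instead of ConfigDicts:

    def group(datasets, n):
        return [datasets[i:i + n] for i in range(0, len(datasets), n)]

    print(group(['ds_a', 'ds_b', 'ds_c', 'ds_d', 'ds_e'], 2))
    # [['ds_a', 'ds_b'], ['ds_c', 'ds_d'], ['ds_e']]  -> one task per group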
@@ -2,7 +2,7 @@ import copy
 import math
 import os.path as osp
 from fnmatch import fnmatch
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union

 import mmengine
 from mmengine.config import Config, ConfigDict

@@ -24,6 +24,11 @@ class SizePartitioner(BasePartitioner):
         max_task_size (int): The maximum size of a task.
         gen_task_coef (int): The dataset cost measurement coefficient for
             generation tasks.
+        strategy (str): The partition strategy. Supported strategies are:
+            'heuristic' and 'split'. Defaults to 'heuristic'.
+            heuristic: split large datasets into several tasks, merge small
+            datasets into one task.
+            split: split large datasets into several tasks only.
         dataset_size_path (str): The path to the dataset size cache file.
         keep_keys (list[str]): The keys to be kept from the experiment config
             to the task config.

@@ -33,12 +38,17 @@ class SizePartitioner(BasePartitioner):
                  out_dir: str,
                  max_task_size: int = 40000,
                  gen_task_coef: int = 20,
+                 strategy: str = 'heuristic',
                  dataset_size_path: str = '.cache/dataset_size.json',
-                 keep_keys: List[str] = ['eval.runner.task.judge_cfg']):
+                 keep_keys: Optional[List[str]] = None):
         super().__init__(out_dir=out_dir, keep_keys=keep_keys)
         self.max_task_size = max_task_size
         self.gen_task_coef = gen_task_coef
         self.dataset_size_path = dataset_size_path
+        assert strategy in ('heuristic', 'split'), \
+            f'Unsupported partition strategy: {strategy}. '\
+            'Supported strategies are: `heuristic`, `split` .'
+        self.strategy = strategy

     def partition(self,
                   models: List[ConfigDict],

@@ -79,47 +89,47 @@ class SizePartitioner(BasePartitioner):
                           reverse=True)
         tasks = []
         for model in models:
-            task = Config({
-                'models': [model],
-                'datasets': [[]],
-                'work_dir': work_dir,
-                **add_cfg
-            })
-            num_data = 0
+            chunks = []  # elements: tuple(size, dataset_chunk)
             for dataset in datasets:
                 filename = get_infer_output_path(model, dataset, out_dir)
+                root, ext = osp.splitext(filename)
                 # skip the task if the task output exists
                 if osp.exists(filename):
                     continue
                 dataset_size = self.get_cost(dataset)
                 if dataset_size > self.max_task_size:
-                    root, ext = osp.splitext(filename)
                     dataset_splits = self.split_dataset(dataset)
                     for i, dataset_split in enumerate(dataset_splits):
-                        # skip the task it the task output exists
                         if not osp.exists(f'{root}_{i}{ext}'):
-                            tasks.append(
-                                Config({
-                                    'models': [model],
-                                    'datasets': [[dataset_split]],
-                                    'work_dir': work_dir,
-                                    **add_cfg
-                                }))
+                            chunks.append((self.max_task_size, dataset_split))
                 else:
-                    if num_data + dataset_size > self.max_task_size:
-                        tasks.append(task)
-                        task = Config({
-                            'models': [model],
-                            'datasets': [[]],
-                            'work_dir': work_dir,
-                            **add_cfg
-                        })
-                        num_data = 0
-                    task['datasets'][0].append(dataset)
-                    num_data = num_data + dataset_size
-            if task['datasets'][0]:
-                tasks.append(task)
+                    chunks.append((dataset_size, dataset))
+
+            if self.strategy == 'heuristic':
+                chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
+                current_size, current_chunks = 0, []
+                for index in range(len(chunks)):
+                    current_size += chunks[index][0]
+                    current_chunks.append(chunks[index][1])
+                    if index == len(chunks) - 1 or current_size + chunks[
+                            index + 1][0] > self.max_task_size:
+                        tasks.append(
+                            Config({
+                                'models': [model],
+                                'datasets': [current_chunks],
+                                'work_dir': work_dir,
+                                **add_cfg
+                            }))
+                        current_size, current_chunks = 0, []
+            elif self.strategy == 'split':
+                for _, dataset in chunks:
+                    tasks.append(
+                        Config({
+                            'models': [model],
+                            'datasets': [[dataset]],
+                            'work_dir': work_dir,
+                            **add_cfg
+                        }))

         return tasks

     @property
...
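The new 'heuristic' strategy sorts the outstanding (cost, dataset) chunks by cost in descending order and greedily fills a task until appending the next chunk would exceed max_task_size; 'split' reproduces the old behaviour of one (possibly split) dataset per task. A standalone sketch of the greedy packing with plain tuples in place of dataset configs:

    def pack(chunks, max_task_size):
        chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
        tasks, current_size, current = [], 0, []
        for index, (size, name) in enumerate(chunks):
            current_size += size
            current.append(name)
            # close the task on the last chunk, or when the next chunk
            # would push the running size over the limit
            if index == len(chunks) - 1 or \
                    current_size + chunks[index + 1][0] > max_task_size:
                tasks.append(current)
                current_size, current = 0, []
        return tasks

    print(pack([(300, 'big'), (120, 'mid'), (80, 'small'), (40, 'tiny')], 400))
    # [['big'], ['mid', 'small', 'tiny']]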