Unverified Commit 61303941 authored by bittersweet1999, committed by GitHub

[Feature] Add double order of subjective evaluation and removing duplicated response among two models (#692)

* add features

* add doc string

* add doc string
parent 82a533a6
@@ -36,7 +36,7 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            random_order=True,
+            infer_order='random',
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
......
@@ -34,7 +34,6 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
-            random_order=True,
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
......
@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
+In addition, you can change the response order of the two models; please refer to `config/subjective_compare.py`.
+When `infer_order` is set to `random`, the two responses are presented to the judge in a random order;
+when `infer_order` is set to `double`, the two models' responses are judged in both orders.
 ### Single Model Scoring Configuration
 For `config/subjective_score.py`, it is mostly the same as `config/subjective_compare.py`; you only need to change the eval mode to `singlescore`.
......
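For reference, here is a minimal sketch of how the renamed option could appear in a dataset config. The field layout mirrors the evaluator diff above; the import paths and the elided judge prompt are assumptions rather than the exact contents of `config/subjective_compare.py`.

```python
# Hypothetical excerpt of a subjective-comparison dataset config (abbreviated).
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate

subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        # 'random' shuffles the two responses per question;
        # 'double' judges every question twice, once in each order.
        infer_order='double',
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                # judge prompt omitted here
            ]),
        ),
    ),
)
```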
@@ -83,6 +83,10 @@ summarizer = dict(
 )
 ```
+In addition, the dataset config also lets you choose the order in which the two responses are compared; please refer to `config/subjective_compare.py`.
+When `infer_order` is set to `random`, the order of the two models' responses is shuffled randomly;
+when `infer_order` is set to `double`, the two models' responses are judged in both orders.
 ### Single-Answer Scoring Configuration
 For single-answer scoring, please refer to `config/subjective_score.py` for more detailed config settings; most of this config is the same as the pairwise-comparison one, and you only need to set the evaluation mode to `singlescore`.
......
+# flake8: noqa: E501
 import os.path as osp
 import random
 from typing import Dict, List, Optional

 import mmengine
+from datasets import Dataset
 from mmengine.config import ConfigDict

 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -14,20 +16,40 @@ from opencompass.utils.text_postprocessors import first_number_postprocess
 from opencompass.utils.types import get_type_from_cfg
-def randomize_preds_and_record_references(predictions,
-                                          references,
-                                          random_order,
-                                          seed=2680):
+def order_preds_and_record_references(predictions,
+                                      references,
+                                      infer_order,
+                                      seed=2680):
+    """Order predictions based on args and record references accordingly.
+
+    Args:
+        predictions (List): List of multi-model predictions.
+        references (List): List of references, one per problem.
+        infer_order (str, optional): The mode of inference order.
+        seed (int, optional): Random seed.
+    """
     random.seed(seed)
     list_of_preds = [[] for _ in range(len(predictions))]
     for i in range(len(predictions[0]['model_preds'])):
         preds = [[pred['model_preds'][i], pred['model_name']]
                  for pred in predictions]
-        if random_order:
+        if infer_order == 'random':
             random.shuffle(preds)
         for j in range(len(preds)):
             list_of_preds[j].append(preds[j][0])
             references[i][f'answer{j+1}'] = preds[j][1]
+    if infer_order == 'double':
+        assert len(predictions) == 2
+        list_of_preds = [
+            a + b for a, b in zip(list_of_preds, reversed(list_of_preds))
+        ]
+        reversed_references = []
+        for item in references:
+            reversed_item = item.copy()
+            reversed_item['answer1'], reversed_item['answer2'] = reversed_item[
+                'answer2'], reversed_item['answer1']
+            reversed_references.append(reversed_item)
+        references += reversed_references
     return list_of_preds, references
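To make the new `double` branch concrete, the following standalone sketch (toy data, not the library code) reproduces what the function above returns for two models and two problems when `infer_order='double'`: each model's prediction list is doubled with the orders swapped, and the `answer1`/`answer2` labels in the duplicated references are swapped to match.

```python
# Toy illustration of infer_order='double' (standalone, mirrors the diff above).
predictions = [
    {'model_name': 'model_a', 'model_preds': ['A1', 'A2']},
    {'model_name': 'model_b', 'model_preds': ['B1', 'B2']},
]
references = [{}, {}]

# Collect predictions in fixed order (no shuffling happens for 'double').
list_of_preds = [[], []]
for i in range(len(predictions[0]['model_preds'])):
    for j, pred in enumerate(predictions):
        list_of_preds[j].append(pred['model_preds'][i])
        references[i][f'answer{j + 1}'] = pred['model_name']

# Append the reversed ordering and the references with swapped answer labels.
list_of_preds = [a + b for a, b in zip(list_of_preds, reversed(list_of_preds))]
references += [{'answer1': r['answer2'], 'answer2': r['answer1']} for r in references]

print(list_of_preds)
# [['A1', 'A2', 'B1', 'B2'], ['B1', 'B2', 'A1', 'A2']]
print(references)
# [{'answer1': 'model_a', 'answer2': 'model_b'},
#  {'answer1': 'model_a', 'answer2': 'model_b'},
#  {'answer1': 'model_b', 'answer2': 'model_a'},
#  {'answer1': 'model_b', 'answer2': 'model_a'}]
```

Judging each pair in both positions is a common way to reduce the position bias of LLM judges.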
@@ -52,10 +74,11 @@ class LMEvaluator:
         prompt_template: ConfigDict,
         judge_cfg: ConfigDict,
         output_path: str,
-        random_order: Optional[bool] = False,
+        infer_order: Optional[str] = 'random',
         dataset_cfg: Optional[ConfigDict] = None,
         postprocessor: ConfigDict = dict(type=first_number_postprocess)
     ) -> None:
+        assert infer_order in ['random', 'double']
         self.output_path = output_path
         out_dir, out_name = osp.split(output_path)
         if not out_dir:
@@ -74,20 +97,36 @@ class LMEvaluator:
         self.postprocessor = get_type_from_cfg(postprocessor)
         self.logger = get_logger()
         self.dataset_cfg = dataset_cfg
-        self.random_order = random_order
+        self.infer_order = infer_order

     def score(self, predictions, references: Optional[List] = None) -> Dict:
         if type(predictions) == list:
             """Apply to multi-model comparison."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
-            predictions, references = randomize_preds_and_record_references(
-                predictions, references, self.random_order)
+            predictions, references = order_preds_and_record_references(
+                predictions, references, self.infer_order)
         elif type(predictions) == dict:
             """Apply to single-model scoring."""
             references = [{} for _ in range(len(predictions[0]['model_preds']))
                           ] if references is None else references
             predictions = [predictions['model_preds']]
+
+        # calculate the number of duplicated predictions
+        total_predictions_num = len(predictions[0])
+        dup_indices = []
+        for i in range(len(predictions[0])):
+            check = [sub[i] for sub in predictions]
+            if len(set(check)) == 1:
+                dup_indices.append(i)
+
+        if len(dup_indices) != 0:
+            # remove duplicated predictions
+            for index in sorted(dup_indices, reverse=True):
+                for sublist in predictions:
+                    del sublist[index]
+                del references[index]
+
         pred_dict = {}
         for i in range(len(predictions)):
             key = 'prediction' if i == 0 else f'prediction{i + 1}'
@@ -95,6 +134,25 @@ class LMEvaluator:
         if self.dataset_cfg:
             dataset = build_dataset_from_cfg(self.dataset_cfg)
+
+            if self.infer_order == 'double':
+                new_ds = {
+                    k: dataset.test[k] * 2
+                    for k in dataset.test.column_names
+                }
+                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
+
+            if len(dup_indices) != 0:
+                remaining_indices = [
+                    idx for idx in range(len(dataset.test))
+                    if idx not in dup_indices
+                ]
+                dataset.reader.dataset['test'] = dataset.test.select(
+                    remaining_indices)
+                print(
+                    f'Among total {total_predictions_num} predictions, there are {len(dup_indices)} predictions that are identical across models; these have been removed!'
+                )
+
             for k, v in pred_dict.items():
                 dataset.reader.dataset['test'] = dataset.test.add_column(k, v)
                 dataset.reader.input_columns.append(k)
......
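The duplicate filtering added to `score` can be illustrated the same way. In this standalone sketch (toy data, not the library code), any position at which every model produced exactly the same string is dropped from the predictions and the matching references, just as the diff above also drops those rows from the dataset's test split.

```python
# Toy illustration of the duplicate-removal step (standalone, not library code).
predictions = [
    ['same', 'A2', 'A3'],  # model 1
    ['same', 'B2', 'A3'],  # model 2
]
references = [{'question': 'q1'}, {'question': 'q2'}, {'question': 'q3'}]

# A position counts as duplicated when all models gave the identical answer.
dup_indices = [
    i for i in range(len(predictions[0]))
    if len({sub[i] for sub in predictions}) == 1
]

# Delete from the back so earlier indices stay valid.
for index in sorted(dup_indices, reverse=True):
    for sublist in predictions:
        del sublist[index]
    del references[index]

print(dup_indices)  # [0, 2]
print(predictions)  # [['A2'], ['B2']]
print(references)   # [{'question': 'q2'}]
```

Skipping questions on which the two models answer identically avoids spending judge calls on comparisons that carry no signal.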