Unverified commit dfd9ac0f, authored by bittersweet1999 and committed by GitHub

[Feature] Add other judgelm prompts for Alignbench (#731)

* add judgellm prompts

* add judgelm prompts

* update import info

* fix situation that no abbr in config

* fix situation that no abbr in config

* add summarizer for other judgellm

* change config name

* add maxlen

* add maxlen

* dict assert

* dict assert

* fix strings

* fix strings
parent 54345c56
@@ -16,7 +16,7 @@ except ImportError:
     from_csv = None
 
 from opencompass.partitioners.sub_naive import remove_duplicate_pairs
-from opencompass.utils import dataset_abbr_from_cfg
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 
 
 def match_general_answer(s):
@@ -58,7 +58,7 @@ class Corev2Summarizer:
         self.match_method = match_method
         self.base_models = self.cfg['eval']['partitioner']['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['compare_models']
-        self.judge_abbr = self.cfg['judge_model']['abbr']
+        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
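Both hunks swap a direct `['abbr']` lookup for `model_abbr_from_cfg`, which is what the "fix situation that no abbr in config" commits refer to: a judge model declared without an explicit `abbr` no longer breaks the summarizer. The real helper ships in `opencompass.utils`; the sketch below only illustrates the fallback behaviour being relied on here and is an assumption, not the library's implementation.

```python
import os.path as osp


def model_abbr_from_cfg_sketch(cfg):
    """Illustrative stand-in for opencompass.utils.model_abbr_from_cfg.

    Assumed behaviour: an explicit 'abbr' wins, otherwise a name is derived
    from the model 'path'; a tuple/list of configs is joined into one name.
    """
    if isinstance(cfg, (list, tuple)):
        return '_'.join(model_abbr_from_cfg_sketch(c) for c in cfg)
    if 'abbr' in cfg:
        return cfg['abbr']
    # Fallback used when the config carries no 'abbr' key.
    return osp.basename(cfg['path'].rstrip('/'))


print(model_abbr_from_cfg_sketch({'path': 'Qwen/Qwen-14B-Chat'}))  # Qwen-14B-Chat
```

The PR also adds what looks like a shared helper module for the new judge-model summarizers: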
# flake8: noqa: E501
import os.path as osp

import mmengine

from opencompass.utils import dataset_abbr_from_cfg


def get_outdir(cfg, time_str):
    """Get output path.

    Args:
        cfg (ConfigDict): The running config.
        time_str (str): Current time.
    """
    work_dir = cfg['work_dir']
    output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
    output_dir = osp.join(osp.split(output_path)[0], f'{time_str}')
    mmengine.mkdir_or_exist(output_dir)
    results_folder = osp.join(work_dir, 'results')
    return output_dir, results_folder
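For context, a summarizer would call `get_outdir` once per run; `work_dir` is the only key it reads, and the values below are made-up placeholders.

```python
from datetime import datetime

cfg = {'work_dir': './outputs/alignbench'}  # hypothetical work dir
time_str = datetime.now().strftime('%Y%m%d_%H%M%S')

output_dir, results_folder = get_outdir(cfg, time_str)
# output_dir     -> ./outputs/alignbench/summary/<time_str>  (created on disk)
# results_folder -> ./outputs/alignbench/results
```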
def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
    """Extract judgements (scores) and references.

    Args:
        dataset (ConfigDict): Dataset config.
        subdir_path (str): Model path in results dir.
        post_process (function): The pre-defined extract function.
    """
    dataset_abbr = dataset_abbr_from_cfg(dataset)
    filename = osp.join(subdir_path, dataset_abbr + '.json')
    partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
    if osp.exists(osp.realpath(filename)):
        result = mmengine.load(filename)
    elif osp.exists(osp.realpath(partial_filename)):
        filename = partial_filename
        result = {}
        i = 1
        partial_dict_flag = 0
        while osp.exists(osp.realpath(filename)):
            res = mmengine.load(filename)
            for k, v in res.items():
                result[partial_dict_flag] = v
                partial_dict_flag += 1
            filename = osp.join(subdir_path,
                                dataset_abbr + '_' + str(i) + '.json')
            i += 1
    else:
        result = {}

    if len(result) == 0:
        print('*' * 100)
        print('There are no results for ' + filename + ' or ' +
              partial_filename)
        print('*' * 100)
    assert len(result) > 0

    judged_answers = []
    references = []
    for k, v in result.items():
        processed_judge = post_process(v['prediction'])
        if processed_judge is not None:
            judged_answers.append(processed_judge)
            references.append(v['gold'])
    print(
        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
    )
    if len(judged_answers) == 0:
        print('*' * 100)
        print(
            'There are no extracted judgements, please change your judge model or check your prompt!!!'
        )
        print('*' * 100)
    assert len(judged_answers) > 0
    return judged_answers, references
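`get_judgeanswer_and_reference` transparently merges sharded result files (`<dataset_abbr>_0.json`, `<dataset_abbr>_1.json`, ...) and hands each prediction to a caller-supplied post-processor. A judge-model summarizer would pair it with something like the sketch below; the `[[7]]`-style rating pattern is only an assumed example, not one of the prompts added by this PR.

```python
import re


def extract_rating(judgement):
    """Hypothetical post-processor: pull a '[[7]]'-style score, else None."""
    match = re.search(r'\[\[(\d+(?:\.\d+)?)\]\]', judgement)
    return float(match.group(1)) if match else None


# Wiring sketch (dataset_cfg and subdir_path come from the summarizer loop):
# judged_answers, references = get_judgeanswer_and_reference(
#     dataset_cfg, subdir_path, extract_rating)
# mean_score = sum(judged_answers) / len(judged_answers)
```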
@@ -14,7 +14,7 @@ from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
 from opencompass.tasks.base import BaseTask
 from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
                                get_infer_output_path, get_logger,
-                               task_abbr_from_cfg)
+                               model_abbr_from_cfg, task_abbr_from_cfg)
 
 
 class SubjectiveEvalTask(BaseTask):
@@ -35,6 +35,16 @@ class SubjectiveEvalTask(BaseTask):
         super().__init__(cfg)
         self.logger = get_logger()
         judge_cfg = cfg.eval.runner.task.get('judge_cfg', {})
+        if type(judge_cfg) != ConfigDict:
+            print('*' * 100)
+            print('Different judge models need different summarizers and '
+                  'prompts, so evaluating multiple judge models in one run '
+                  'is not supported. Please pass a single dict as judge_cfg '
+                  '(the first element of your list is fine) instead of a '
+                  'list. To evaluate several judge models, launch one config '
+                  'per judge from a bash or bat script.')
+            print('*' * 100)
+        assert type(judge_cfg) == ConfigDict
         run_cfg = judge_cfg.get('run_cfg', {})
         self.num_gpus = run_cfg.get('num_gpus', 0)
         self.num_procs = run_cfg.get('num_procs', 1)
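The new guard boils down to: `judge_cfg` must be a single config dict, not a list of judge models. Under the usual `eval.runner.task` layout that looks roughly like the sketch below; the concrete type names and values are illustrative and not taken from this diff.

```python
# Hypothetical judge model; any single model config works here.
judge_model = dict(abbr='gpt-4-judge', path='gpt-4', max_out_len=1024)

eval = dict(
    partitioner=dict(type='SubjectiveNaivePartitioner', mode='singlescore'),
    runner=dict(
        type='LocalRunner',
        task=dict(
            type='SubjectiveEvalTask',
            # A single dict, not a list. To compare several judge models,
            # launch one config per judge (e.g. from a shell script).
            judge_cfg=judge_model,
        ),
    ),
)
```

The remaining hunks then reuse `model_abbr_from_cfg` wherever an abbreviation is needed for result paths: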
@@ -63,16 +73,14 @@ class SubjectiveEvalTask(BaseTask):
         # model_cfg can be a list of model configs
         for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
             for dataset_cfg in dataset_cfgs:
-                # self.model_cfg = model_cfg
-                # self.dataset_cfg = dataset_cfg
-
                 # Load Dataset
                 eval_cfg = dataset_cfg.get('eval_cfg')
                 output_column = dataset_cfg['reader_cfg']['output_column']
                 if type(model_cfg) == ConfigDict:
                     model_cfg = (model_cfg, )
                 model_cfg += ({
-                    'abbr': 'judged-by--' + self.judge_cfg['abbr']
+                    'abbr':
+                    'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
                 }, )
                 out_path = get_infer_output_path(
                     model_cfg, dataset_cfg, osp.join(self.work_dir, 'results'))
@@ -142,7 +150,10 @@ class SubjectiveEvalTask(BaseTask):
             kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
             proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
             pred_strs = [proc(s, **kwargs) for s in pred_strs]
-        return {'model_name': model_cfg['abbr'], 'model_preds': pred_strs}
+        return {
+            'model_name': model_abbr_from_cfg(model_cfg),
+            'model_preds': pred_strs
+        }
 
     def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column):
         test_set = build_dataset_from_cfg(dataset_cfg).test
@@ -241,7 +252,10 @@ class SubjectiveEvalTask(BaseTask):
             for dataset in datasets:
                 if type(model) == ConfigDict:
                     model = (model, )
-                model += ({'abbr': 'judged-by--' + self.judge_cfg['abbr']}, )
+                model += ({
+                    'abbr':
+                    'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
+                }, )
                 output_paths.append(
                     get_infer_output_path(
                         model, dataset,
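Taken together, these changes mean the `judged-by--<judge>` suffix still lands in the results directory name even when neither the evaluated model nor the judge defines `abbr`. Reusing the stand-in sketch from above, and assuming tuple abbreviations are joined with an underscore, the layout consumed by `get_judgeanswer_and_reference` would look roughly like:

```python
model_cfg = dict(path='internlm/internlm-chat-7b')   # no 'abbr' key
judge_cfg = dict(abbr='gpt-4-judge', path='gpt-4')

subdir = model_abbr_from_cfg_sketch(
    (model_cfg,
     {'abbr': 'judged-by--' + model_abbr_from_cfg_sketch(judge_cfg)}))
print('results/' + subdir + '/<dataset_abbr>.json')
# results/internlm-chat-7b_judged-by--gpt-4-judge/<dataset_abbr>.json
```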