Unverified commit dfd9ac0f, authored by bittersweet1999 and committed by GitHub

[Feature] Add other judgelm prompts for Alignbench (#731)

* add judgellm prompts

* add judgelm prompts

* update import info

* fix situation that no abbr in config

* fix situation that no abbr in config

* add summarizer for other judgellm

* change config name

* add maxlen

* add maxlen

* dict assert

* dict assert

* fix strings

* fix strings
parent 54345c56
@@ -16,7 +16,7 @@ except ImportError:
     from_csv = None
 
 from opencompass.partitioners.sub_naive import remove_duplicate_pairs
-from opencompass.utils import dataset_abbr_from_cfg
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 
 
 def match_general_answer(s):
@@ -58,7 +58,7 @@ class Corev2Summarizer:
         self.match_method = match_method
         self.base_models = self.cfg['eval']['partitioner']['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['compare_models']
-        self.judge_abbr = self.cfg['judge_model']['abbr']
+        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
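Both hunks swap a direct `['abbr']` lookup for `model_abbr_from_cfg`, which is what the "fix situation that no abbr in config" commits refer to: a judge model declared without an explicit `abbr` no longer breaks the summarizer. The real helper ships in `opencompass.utils`; the sketch below only illustrates the fallback behaviour being relied on here and is an assumption, not the library's implementation.

```python
import os.path as osp


def model_abbr_from_cfg_sketch(cfg):
    """Illustrative stand-in for opencompass.utils.model_abbr_from_cfg.

    Assumed behaviour: an explicit 'abbr' wins, otherwise a name is derived
    from the model 'path'; a tuple/list of configs is joined into one name.
    """
    if isinstance(cfg, (list, tuple)):
        return '_'.join(model_abbr_from_cfg_sketch(c) for c in cfg)
    if 'abbr' in cfg:
        return cfg['abbr']
    # Fallback used when the config carries no 'abbr' key.
    return osp.basename(cfg['path'].rstrip('/'))


print(model_abbr_from_cfg_sketch({'path': 'Qwen/Qwen-14B-Chat'}))  # Qwen-14B-Chat
```

The PR also adds what looks like a shared helper module for the new judge-model summarizers: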
# flake8: noqa: E501
import os.path as osp

import mmengine

from opencompass.utils import dataset_abbr_from_cfg


def get_outdir(cfg, time_str):
    """Get output path.

    Args:
        cfg (ConfigDict): The running config.
        time_str (str): Current time.
    """
    work_dir = cfg['work_dir']
    output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
    output_dir = osp.join(osp.split(output_path)[0], f'{time_str}')
    mmengine.mkdir_or_exist(output_dir)
    results_folder = osp.join(work_dir, 'results')
    return output_dir, results_folder
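For context, a summarizer would call `get_outdir` once per run; `work_dir` is the only key it reads, and the values below are made-up placeholders.

```python
from datetime import datetime

cfg = {'work_dir': './outputs/alignbench'}  # hypothetical work dir
time_str = datetime.now().strftime('%Y%m%d_%H%M%S')

output_dir, results_folder = get_outdir(cfg, time_str)
# output_dir     -> ./outputs/alignbench/summary/<time_str>  (created on disk)
# results_folder -> ./outputs/alignbench/results
```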
def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
    """Extract judgements (scores) and references.

    Args:
        dataset (ConfigDict): Dataset config.
        subdir_path (str): Model path in results dir.
        post_process (function): The pre-defined extract function.
    """
    dataset_abbr = dataset_abbr_from_cfg(dataset)
    filename = osp.join(subdir_path, dataset_abbr + '.json')
    partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
    if osp.exists(osp.realpath(filename)):
        result = mmengine.load(filename)
    elif osp.exists(osp.realpath(partial_filename)):
        filename = partial_filename
        result = {}
        i = 1
        partial_dict_flag = 0
        while osp.exists(osp.realpath(filename)):
            res = mmengine.load(filename)
            for k, v in res.items():
                result[partial_dict_flag] = v
                partial_dict_flag += 1
            filename = osp.join(subdir_path,
                                dataset_abbr + '_' + str(i) + '.json')
            i += 1
    else:
        result = {}

    if len(result) == 0:
        print('*' * 100)
        print('There are no results for ' + filename + ' or ' +
              partial_filename)
        print('*' * 100)
    assert len(result) > 0

    judged_answers = []
    references = []
    for k, v in result.items():
        processed_judge = post_process(v['prediction'])
        if processed_judge is not None:
            judged_answers.append(processed_judge)
            references.append(v['gold'])
    print(
        f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
    )
    if len(judged_answers) == 0:
        print('*' * 100)
        print(
            'There are no extracted judgements, please change your judge model or check your prompt!!!'
        )
        print('*' * 100)
    assert len(judged_answers) > 0
    return judged_answers, references
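`get_judgeanswer_and_reference` transparently merges sharded result files (`<dataset_abbr>_0.json`, `<dataset_abbr>_1.json`, ...) and hands each prediction to a caller-supplied post-processor. A judge-model summarizer would pair it with something like the sketch below; the `[[7]]`-style rating pattern is only an assumed example, not one of the prompts added by this PR.

```python
import re


def extract_rating(judgement):
    """Hypothetical post-processor: pull a '[[7]]'-style score, else None."""
    match = re.search(r'\[\[(\d+(?:\.\d+)?)\]\]', judgement)
    return float(match.group(1)) if match else None


# Wiring sketch (dataset_cfg and subdir_path come from the summarizer loop):
# judged_answers, references = get_judgeanswer_and_reference(
#     dataset_cfg, subdir_path, extract_rating)
# mean_score = sum(judged_answers) / len(judged_answers)
```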
@@ -14,7 +14,7 @@ from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
 from opencompass.tasks.base import BaseTask
 from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
                                get_infer_output_path, get_logger,
-                               task_abbr_from_cfg)
+                               model_abbr_from_cfg, task_abbr_from_cfg)
 
 
 class SubjectiveEvalTask(BaseTask):
@@ -35,6 +35,16 @@ class SubjectiveEvalTask(BaseTask):
         super().__init__(cfg)
         self.logger = get_logger()
         judge_cfg = cfg.eval.runner.task.get('judge_cfg', {})
+        if type(judge_cfg) != ConfigDict:
+            print('*' * 100)
+            print('Different judge models need different summarizers and '
+                  'prompts, so evaluating multiple judge models in one run '
+                  'is not supported. Please pass a single dict as judge_cfg '
+                  '(the first element of your list is fine) instead of a '
+                  'list. To evaluate several judge models, launch one config '
+                  'per judge from a bash or bat script.')
+            print('*' * 100)
+        assert type(judge_cfg) == ConfigDict
         run_cfg = judge_cfg.get('run_cfg', {})
         self.num_gpus = run_cfg.get('num_gpus', 0)
         self.num_procs = run_cfg.get('num_procs', 1)
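The new guard boils down to: `judge_cfg` must be a single config dict, not a list of judge models. Under the usual `eval.runner.task` layout that looks roughly like the sketch below; the concrete type names and values are illustrative and not taken from this diff.

```python
# Hypothetical judge model; any single model config works here.
judge_model = dict(abbr='gpt-4-judge', path='gpt-4', max_out_len=1024)

eval = dict(
    partitioner=dict(type='SubjectiveNaivePartitioner', mode='singlescore'),
    runner=dict(
        type='LocalRunner',
        task=dict(
            type='SubjectiveEvalTask',
            # A single dict, not a list. To compare several judge models,
            # launch one config per judge (e.g. from a shell script).
            judge_cfg=judge_model,
        ),
    ),
)
```

The remaining hunks then reuse `model_abbr_from_cfg` wherever an abbreviation is needed for result paths: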
@@ -63,16 +73,14 @@ class SubjectiveEvalTask(BaseTask):
         # model_cfg can be a list of model configs
         for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
             for dataset_cfg in dataset_cfgs:
-                # self.model_cfg = model_cfg
-                # self.dataset_cfg = dataset_cfg
-
                 # Load Dataset
                 eval_cfg = dataset_cfg.get('eval_cfg')
                 output_column = dataset_cfg['reader_cfg']['output_column']
                 if type(model_cfg) == ConfigDict:
                     model_cfg = (model_cfg, )
                 model_cfg += ({
-                    'abbr': 'judged-by--' + self.judge_cfg['abbr']
+                    'abbr':
+                    'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
                 }, )
                 out_path = get_infer_output_path(
                     model_cfg, dataset_cfg, osp.join(self.work_dir, 'results'))
@@ -142,7 +150,10 @@ class SubjectiveEvalTask(BaseTask):
             kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
             proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
             pred_strs = [proc(s, **kwargs) for s in pred_strs]
-        return {'model_name': model_cfg['abbr'], 'model_preds': pred_strs}
+        return {
+            'model_name': model_abbr_from_cfg(model_cfg),
+            'model_preds': pred_strs
+        }
 
     def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column):
         test_set = build_dataset_from_cfg(dataset_cfg).test
@@ -241,7 +252,10 @@ class SubjectiveEvalTask(BaseTask):
             for dataset in datasets:
                 if type(model) == ConfigDict:
                     model = (model, )
-                model += ({'abbr': 'judged-by--' + self.judge_cfg['abbr']}, )
+                model += ({
+                    'abbr':
+                    'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
+                }, )
                 output_paths.append(
                     get_infer_output_path(
                         model, dataset,
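Taken together, these changes mean the `judged-by--<judge>` suffix still lands in the results directory name even when neither the evaluated model nor the judge defines `abbr`. Reusing the stand-in sketch from above, and assuming tuple abbreviations are joined with an underscore, the layout consumed by `get_judgeanswer_and_reference` would look roughly like:

```python
model_cfg = dict(path='internlm/internlm-chat-7b')   # no 'abbr' key
judge_cfg = dict(abbr='gpt-4-judge', path='gpt-4')

subdir = model_abbr_from_cfg_sketch(
    (model_cfg,
     {'abbr': 'judged-by--' + model_abbr_from_cfg_sketch(judge_cfg)}))
print('results/' + subdir + '/<dataset_abbr>.json')
# results/internlm-chat-7b_judged-by--gpt-4-judge/<dataset_abbr>.json
```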