Commit c94cc943 authored by Leymore, committed by gaotong

Add release contribution

parent e6b5bdcb
from .openicl_eval import * # noqa: F401, F403
from .openicl_infer import * # noqa: F401, F403
import os
from abc import abstractmethod
from typing import List
from mmengine.config import ConfigDict
from opencompass.utils import get_infer_output_path, task_abbr_from_cfg
class BaseTask:
"""Base class for all tasks. There are two ways to run the task:
1. Directly by calling the `run` method.
2. Calling the `get_command_template` method to get the command template,
and then running the command in the shell.
Args:
cfg (ConfigDict): Config dict.
"""
# The prefix of the task name.
name_prefix: str = None
# The subdirectory of the work directory to store the log files.
log_subdir: str = None
# The subdirectory of the work directory to store the output files.
output_subdir: str = None
def __init__(self, cfg: ConfigDict):
self.cfg = cfg
self.model_cfgs = cfg['models']
self.dataset_cfgs = cfg['datasets']
self.work_dir = cfg['work_dir']
@abstractmethod
def run(self):
"""Run the task."""
@abstractmethod
def get_command_template(self) -> str:
"""Get the command template for the task.
The command template should
contain the following placeholders:
1. ``{SCRIPT_PATH}``: This placeholder will be replaced by the path to
the script file of the task.
2. ``{CFG_PATH}``: This placeholder will be replaced by the
path to the config file of the task.
"""
@property
def name(self) -> str:
return self.name_prefix + task_abbr_from_cfg(
{
'models': self.model_cfgs,
'datasets': self.dataset_cfgs
})
def __repr__(self) -> str:
return f'{self.__class__.__name__}({self.cfg})'
def get_log_path(self, file_extension: str = 'json') -> str:
"""Get the path to the log file.
Args:
file_extension (str): The file extension of the log file.
Default: 'json'.
"""
return get_infer_output_path(
self.model_cfgs[0], self.dataset_cfgs[0][0],
os.path.join(self.work_dir, self.log_subdir), file_extension)
def get_output_paths(self, file_extension: str = 'json') -> List[str]:
"""Get the paths to the output files. Every file should exist if the
task succeeds.
Args:
file_extension (str): The file extension of the output files.
Default: 'json'.
"""
output_paths = []
for model, datasets in zip(self.model_cfgs, self.dataset_cfgs):
for dataset in datasets:
output_paths.append(
get_infer_output_path(
model, dataset,
os.path.join(self.work_dir, self.output_subdir),
file_extension))
return output_paths
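# Illustrative sketch only, not part of this commit: DummyTask is a
# hypothetical subclass showing the two ways to run a task described in the
# BaseTask docstring, either by calling `run()` directly or by filling the
# command template returned by `get_command_template()` and launching it in a
# shell. All names and paths below are assumptions for demonstration.
class DummyTask(BaseTask):
    name_prefix = 'Dummy'
    log_subdir = 'logs/dummy'
    output_subdir = 'dummy'

    def get_command_template(self) -> str:
        # {SCRIPT_PATH} and {CFG_PATH} are filled in by the launcher.
        return 'python3 {SCRIPT_PATH} {CFG_PATH}'

    def run(self):
        # Direct, in-process execution: report the outputs this task would
        # produce for every (model, dataset) pair in the config.
        for path in self.get_output_paths():
            print(f'{self.name} would write {path}')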
from collections import defaultdict
from typing import Dict, List
import mmengine
from mmengine import ConfigDict, track_parallel_progress
from opencompass.registry import EVALUATORS, MODELS
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
@EVALUATORS.register_module()
class ModelEvaluator:
"""TODO: Finish the implementation"""
def __init__(
self,
config: ConfigDict,
) -> None:
self.tasks = []
self.cfg = config
self.parse_cfg(self.cfg.pop('evaluator', ConfigDict({})))
self.dataset_abbrs = [
dataset_abbr_from_cfg(d) for d in self.cfg['datasets']
]
self.model_abbrs = [model_abbr_from_cfg(m) for m in self.cfg['models']]
assert len(self.model_abbrs) > 1
def parse_cfg(self, cfg: ConfigDict):
# The judge model used to rank the responses
self.judger = MODELS.build(cfg['judger'])
# Maximum number of workers
self.max_num_workers = cfg.get('max_num_workers', 4)
def evaluate(self):
model_scores = defaultdict(int)
all_partial_scores = track_parallel_progress(
self._evaluate_dataset,
self.dataset_abbrs,
nproc=self.max_num_workers,
keep_order=True)
for partial_scores in all_partial_scores:
for model_idx, score in partial_scores.items():
model_scores[self.model_abbrs[int(model_idx)]] += score
print(model_scores)
def _load_dataset(self, dataset_abbr: str):
# Load each model's predictions and the shared questions for this dataset.
original_datasets = []
self.responses: List[List[str]] = []
self.questions: List[str] = []
for model_abbr in self.model_abbrs:
filename = f'output_model/{model_abbr}/{dataset_abbr}.json'
original_datasets.append(mmengine.load(filename))
for key in original_datasets[-1].keys():
self.questions.append(original_datasets[-1][key]['origin_prompt'])
responses = []
for i in range(len(self.model_abbrs)):
responses.append(original_datasets[i][key]['prediction'])
self.responses.append(responses)
def _evaluate_dataset(self, dataset_abbr: str):
self._load_dataset(dataset_abbr=dataset_abbr)
model_scores = defaultdict(int)
for question, responses in zip(self.questions, self.responses):
prompt = self._make_prompt(question, responses)
print(prompt)
output = self.judger.generate(prompt,
max_out_len=2 *
len(self.model_abbrs))
model_scores = self._rank_models(output, model_scores)
return model_scores
def _make_prompt(self, question: str, responses: List[str]) -> str:
prompt = ('Below are a question and a set of answers, each numbered by'
' a digit. Please sort the answers from least to most '
'appropriate to the question. Only return the digits '
'separated by a blank space. For example, when there are '
'three answers presented, you should say "1 0 2" when the '
'second answer is the worst and the third is the best.\n'
f'Q: {question}\n')
for i, response in enumerate(responses):
prompt += f'A{i}: {response}\n'
return prompt
def _rank_models(self, output: str,
model_scores: defaultdict) -> Dict[str, int]:
"""Returns model ranking."""
output = output.strip().split(' ')
for score, model_idx in enumerate(output):
model_scores[model_idx] += int(score)
return model_scores
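# Illustrative sketch only, not part of this commit: shows how a judger reply
# such as "1 0 2" is folded into the running scores by `_rank_models`. The
# reply lists model indices from least to most appropriate, so the model
# listed last collects the highest score. `self` is unused by the method, so
# None is passed purely for this demonstration.
if __name__ == '__main__':
    demo_scores = ModelEvaluator._rank_models(None, '1 0 2', defaultdict(int))
    print(dict(demo_scores))  # {'1': 0, '0': 1, '2': 2}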
import argparse
import os.path as osp
import time
from typing import Optional
import mmengine
from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist
from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask
from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path,
get_logger, task_abbr_from_cfg)
@TASKS.register_module(force=(__name__ == '__main__')) # A hack for script run
class OpenICLEvalTask(BaseTask):
"""OpenICL Evaluation Task.
This task is used to evaluate the metric between predictions and
references.
"""
name_prefix = 'OpenICLEval'
log_subdir = 'logs/eval'
output_subdir = 'results'
def __init__(self, cfg: ConfigDict):
super().__init__(cfg)
self.num_gpus = 0
self.logger = get_logger()
def get_command_template(self):
return 'python3 {SCRIPT_PATH} {CFG_PATH}'
def run(self):
for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
for dataset_cfg in dataset_cfgs:
self.model_cfg = model_cfg
self.dataset_cfg = dataset_cfg
# Load Dataset
self.eval_cfg = self.dataset_cfg.get('eval_cfg')
self.output_column = dataset_cfg['reader_cfg']['output_column']
out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
if osp.exists(out_path):
continue
self._score()
def _score(self):
test_set = build_dataset_from_cfg(self.dataset_cfg).test
# Postprocess dataset if necessary
if 'dataset_postprocessor' in self.eval_cfg:
proc = TEXT_POSTPROCESSORS.get(
    self.eval_cfg['dataset_postprocessor']['type'])
def postprocess(sample):
s = sample[self.output_column]
sample[self.output_column] = proc(s)
return sample
test_set = test_set.map(postprocess)
# Load predictions
filename = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'predictions'))
# in case the prediction is partial
root, ext = osp.splitext(filename)
partial_filename = root + '_0' + ext
if not osp.exists(osp.realpath(filename)) and not osp.exists(
osp.realpath(partial_filename)):
result = {'error': 'No predictions found.'}
else:
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
pred_strs = [
preds[str(i)]['prediction'] for i in range(len(preds))
]
else:
filename = partial_filename
pred_strs = []
i = 1
while osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
filename = root + f'_{i}' + ext
i += 1
pred_strs += [
preds[str(i)]['prediction'] for i in range(len(preds))
]
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(self.model_cfg['meta_template'])
role = parser.roles[self.eval_cfg['pred_role']]
pred_strs = [
self._extract_role_pred(pred, role.get('begin', None),
role.get('end', None))
for pred in pred_strs
]
# Postprocess predictions if necessary
if 'pred_postprocessor' in self.eval_cfg:
proc = TEXT_POSTPROCESSORS.get(
self.eval_cfg['pred_postprocessor']['type'])
pred_strs = [proc(s) for s in pred_strs]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
result = icl_evaluator.score(
predictions=pred_strs, references=test_set[self.output_column])
if 'error' in result:
self.logger.error(
f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
return
# Save result
out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
mkdir_or_exist(osp.split(out_path)[0])
mmengine.dump(result, out_path)
def _extract_role_pred(self, s: str, begin_str: Optional[str],
end_str: Optional[str]) -> str:
"""Extract the role prediction from the full prediction string. The
role prediction may be the substring between the begin and end string.
Args:
s (str): Full prediction string.
begin_str (str): The beginning string of the role.
end_str (str): The ending string of the role.
Returns:
str: The extracted role prediction.
"""
start = 0
end = len(s)
if begin_str:
begin_idx = s.find(begin_str)
if begin_idx != -1:
start = begin_idx + len(begin_str)
if end_str:
# TODO: Support calling tokenizer for the accurate eos token
# and avoid such hardcode
end_idx = s.find(end_str[:1], start)
if end_idx != -1:
end = end_idx
return s[start:end]
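def _role_pred_demo() -> str:
    """Illustrative sketch only, not part of this commit.

    Shows what ``OpenICLEvalTask._extract_role_pred`` returns for an assumed
    BOT-style meta template whose ``begin``/``end`` markers wrap the answer
    inside the raw prediction string. The markers used here are invented for
    demonstration.
    """
    task = OpenICLEvalTask.__new__(OpenICLEvalTask)  # skip __init__ for the demo
    return task._extract_role_pred('<BOT>: 42<eoa>\n', '<BOT>: ', '<eoa>')  # '42'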
def parse_args():
parser = argparse.ArgumentParser(description='Score Calculator')
parser.add_argument('config', help='Config file path')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
cfg = Config.fromfile(args.config)
start_time = time.time()
inferencer = OpenICLEvalTask(cfg)
inferencer.run()
end_time = time.time()
get_logger().info(f'time elapsed: {end_time - start_time:.2f}s')
import os.path as osp
from typing import Dict
from mmengine.config import ConfigDict
def model_abbr_from_cfg(cfg: ConfigDict) -> str:
"""Generate model abbreviation from the model's confg."""
if 'abbr' in cfg:
return cfg['abbr']
model_abbr = cfg['type'] + '_' + '_'.join(
osp.realpath(cfg['path']).split('/')[-2:])
model_abbr = model_abbr.replace('/', '_')
return model_abbr
def dataset_abbr_from_cfg(cfg: ConfigDict) -> str:
"""Returns dataset abbreviation from the dataset's confg."""
if 'abbr' in cfg:
return cfg['abbr']
dataset_abbr = cfg['path']
if 'name' in cfg:
dataset_abbr += '_' + cfg['name']
dataset_abbr = dataset_abbr.replace('/', '_')
return dataset_abbr
def task_abbr_from_cfg(task: Dict) -> str:
"""Returns task abbreviation from the task's confg."""
return '[' + ','.join([
f'{model_abbr_from_cfg(model)}/'
f'{dataset_abbr_from_cfg(dataset)}'
for i, model in enumerate(task['models'])
for dataset in task['datasets'][i]
]) + ']'
def get_infer_output_path(model_cfg: ConfigDict,
dataset_cfg: ConfigDict,
root_path: str = None,
file_extension: str = 'json') -> str:
# TODO: Rename this func
assert root_path is not None, 'default root_path is not allowed any more'
model_abbr = model_abbr_from_cfg(model_cfg)
dataset_abbr = dataset_abbr_from_cfg(dataset_cfg)
return osp.join(root_path, model_abbr, f'{dataset_abbr}.{file_extension}')
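# Illustrative sketch only, not part of this commit: the configs below are
# assumptions showing how the abbreviation helpers compose into an inference
# output path.
if __name__ == '__main__':
    model_cfg = ConfigDict(abbr='llama-7b', type='HuggingFace', path='llama/7b')
    dataset_cfg = ConfigDict(abbr='gsm8k', path='gsm8k', name='main')
    print(model_abbr_from_cfg(model_cfg))      # llama-7b
    print(dataset_abbr_from_cfg(dataset_cfg))  # gsm8k
    print(get_infer_output_path(model_cfg, dataset_cfg, 'outputs/results'))
    # outputs/results/llama-7b/gsm8k.json
    print(task_abbr_from_cfg({'models': [model_cfg],
                              'datasets': [[dataset_cfg]]}))
    # [llama-7b/gsm8k]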
import subprocess
def get_git_root() -> str:
cmd = ['git', 'rev-parse', '--show-toplevel']
result = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
return result.stdout.decode('utf-8').strip()
def get_latest_commit(branch: str) -> str:
cmd = ['git', 'rev-parse', branch]
result = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
return result.stdout.decode('utf-8').strip()
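# Illustrative usage sketch only, not part of this commit: both helpers shell
# out to git and return the trimmed stdout, and raise CalledProcessError when
# the command fails (for example, when run outside a git checkout).
if __name__ == '__main__':
    print(get_git_root())             # absolute path of the repository root
    print(get_latest_commit('HEAD'))  # full hash of the current HEAD commit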
accelerate>=0.19.0
datasets>=2.12.0
evaluate>=0.3.0
faiss_gpu==1.7.2
nltk==3.8
numpy==1.23.4
openai==0.27.1
rank_bm25==0.2.2
requests==2.28.1
scikit_learn==1.2.1
sentence_transformers==2.2.2
torch>=1.13.1
tqdm==4.64.1
transformers>=4.29.1
openai
mmengine
jieba
pandas<2.0.0
cpm_kernels
tokenizers>=0.13.3
tabulate
fairscale
colossalai
boto3
tiktoken
from setuptools import find_packages, setup
from setuptools.command.install import install
class DownloadNLTK(install):
def run(self):
self.do_egg_install()
import nltk
nltk.download('punkt')
with open('README.md') as f:
readme = f.read()
def do_setup():
setup(
name='opencompass',
version='0.5.0',
description='A comprehensive toolkit for large model evaluation',
# url="",
# author="",
long_description=readme,
long_description_content_type='text/markdown',
cmdclass={'download_nltk': DownloadNLTK},
setup_requires=['nltk==3.8'],
python_requires='>=3.8.0',
packages=find_packages(exclude=[
'test*',
'paper_test*',
]),
keywords=['AI', 'NLP', 'in-context learning'],
classifiers=[
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
])
if __name__ == '__main__':
do_setup()
import unittest
from opencompass.utils.prompt import PromptList
class TestPromptList(unittest.TestCase):
def test_initialization(self):
pl = PromptList()
self.assertEqual(pl, [])
pl = PromptList(['test', '123'])
self.assertEqual(pl, ['test', '123'])
def test_format(self):
pl = PromptList(['hi {a}{b}', {'prompt': 'hey {a}!'}, '123'])
new_pl = pl.format(a=1, b=2, c=3)
self.assertEqual(new_pl, ['hi 12', {'prompt': 'hey 1!'}, '123'])
new_pl = pl.format(b=2)
self.assertEqual(new_pl, ['hi {a}2', {'prompt': 'hey {a}!'}, '123'])
new_pl = pl.format(d=1)
self.assertEqual(new_pl, ['hi {a}{b}', {'prompt': 'hey {a}!'}, '123'])
def test_replace(self):
pl = PromptList(['hello world', {'prompt': 'hello world'}, '123'])
new_pl = pl.replace('world', 'there')
self.assertEqual(new_pl,
['hello there', {
'prompt': 'hello there'
}, '123'])
new_pl = pl.replace('123', PromptList(['p', {'role': 'BOT'}]))
self.assertEqual(
new_pl,
['hello world', {
'prompt': 'hello world'
}, 'p', {
'role': 'BOT'
}])
new_pl = pl.replace('2', PromptList(['p', {'role': 'BOT'}]))
self.assertEqual(new_pl, [
'hello world', {
'prompt': 'hello world'
}, '1', 'p', {
'role': 'BOT'
}, '3'
])
with self.assertRaises(TypeError):
new_pl = pl.replace('world', PromptList(['new', 'world']))
def test_add(self):
pl = PromptList(['hello'])
new_pl = pl + ' world'
self.assertEqual(new_pl, ['hello', ' world'])
pl2 = PromptList([' world'])
new_pl = pl + pl2
self.assertEqual(new_pl, ['hello', ' world'])
new_pl = 'hi, ' + pl
self.assertEqual(new_pl, ['hi, ', 'hello'])
pl += '!'
self.assertEqual(pl, ['hello', '!'])
def test_str(self):
pl = PromptList(['hello', ' world', {'prompt': '!'}])
self.assertEqual(str(pl), 'hello world!')
with self.assertRaises(TypeError):
pl = PromptList(['hello', ' world', 123])
str(pl)