Commit c94cc943 authored by Leymore, committed by gaotong

Add release contribution

parent e6b5bdcb
from .openicl_eval import * # noqa: F401, F403
from .openicl_infer import * # noqa: F401, F403
import os
from abc import abstractmethod
from typing import List
from mmengine.config import ConfigDict
from opencompass.utils import get_infer_output_path, task_abbr_from_cfg
class BaseTask:
"""Base class for all tasks. There are two ways to run the task:
1. Directly by calling the `run` method.
2. Calling the `get_command_template` method to get the command template,
and then running the command in the shell.
Args:
cfg (ConfigDict): Config dict.
"""
# The prefix of the task name.
name_prefix: str = None
# The subdirectory of the work directory to store the log files.
log_subdir: str = None
# The subdirectory of the work directory to store the output files.
output_subdir: str = None
def __init__(self, cfg: ConfigDict):
self.cfg = cfg
self.model_cfgs = cfg['models']
self.dataset_cfgs = cfg['datasets']
self.work_dir = cfg['work_dir']
@abstractmethod
def run(self):
"""Run the task."""
@abstractmethod
def get_command_template(self) -> str:
"""Get the command template for the task.
The command template should
contain the following placeholders:
1. ``{SCRIPT_PATH}``: This placeholder will be replaced by the path to
the script file of the task.
2. ``{CFG_PATH}``: This placeholder will be replaced by the
path to the config file of the task.
"""
@property
def name(self) -> str:
return self.name_prefix + task_abbr_from_cfg(
{
'models': self.model_cfgs,
'datasets': self.dataset_cfgs
})
def __repr__(self) -> str:
return f'{self.__class__.__name__}({self.cfg})'
def get_log_path(self, file_extension: str = 'json') -> str:
"""Get the path to the log file.
Args:
file_extension (str): The file extension of the log file.
Default: 'json'.
"""
return get_infer_output_path(
self.model_cfgs[0], self.dataset_cfgs[0][0],
os.path.join(self.work_dir, self.log_subdir), file_extension)
def get_output_paths(self, file_extension: str = 'json') -> List[str]:
"""Get the paths to the output files. Every file should exist if the
task succeeds.
Args:
file_extension (str): The file extension of the output files.
Default: 'json'.
"""
output_paths = []
for model, datasets in zip(self.model_cfgs, self.dataset_cfgs):
for dataset in datasets:
output_paths.append(
get_infer_output_path(
model, dataset,
os.path.join(self.work_dir, self.output_subdir),
file_extension))
return output_paths
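# Illustrative sketch only, not part of this commit: DummyTask is a
# hypothetical subclass showing the two ways to run a task described in the
# BaseTask docstring, either by calling `run()` directly or by filling the
# command template returned by `get_command_template()` and launching it in a
# shell. All names and paths below are assumptions for demonstration.
class DummyTask(BaseTask):
    name_prefix = 'Dummy'
    log_subdir = 'logs/dummy'
    output_subdir = 'dummy'

    def get_command_template(self) -> str:
        # {SCRIPT_PATH} and {CFG_PATH} are filled in by the launcher.
        return 'python3 {SCRIPT_PATH} {CFG_PATH}'

    def run(self):
        # Direct, in-process execution: report the outputs this task would
        # produce for every (model, dataset) pair in the config.
        for path in self.get_output_paths():
            print(f'{self.name} would write {path}')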
from collections import defaultdict
from typing import Dict, List
import mmengine
from mmengine import ConfigDict, track_parallel_progress
from opencompass.registry import EVALUATORS, MODELS
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
@EVALUATORS.register_module()
class ModelEvaluator:
"""TODO: Finish the implementation"""
def __init__(
self,
config: ConfigDict,
) -> None:
self.tasks = []
self.cfg = config
self.parse_cfg(self.cfg.pop('evaluator', ConfigDict({})))
self.dataset_abbrs = [
dataset_abbr_from_cfg(d) for d in self.cfg['datasets']
]
self.model_abbrs = [model_abbr_from_cfg(m) for m in self.cfg['models']]
assert len(self.model_abbrs) > 1
def parse_cfg(self, cfg: ConfigDict):
# The judge model used to rank the responses
self.judger = MODELS.build(cfg['judger'])
# Maximum number of workers
self.max_num_workers = cfg.get('max_num_workers', 4)
def evaluate(self):
model_scores = defaultdict(int)
all_partial_scores = track_parallel_progress(
self._evaluate_dataset,
self.dataset_abbrs,
nproc=self.max_num_workers,
keep_order=True)
for partial_scores in all_partial_scores:
for model_idx, score in partial_scores.items():
model_scores[self.model_abbrs[int(model_idx)]] += score
print(model_scores)
def _load_dataset(self, dataset_abbr: str):
# Load each model's predictions and the shared questions for this dataset.
original_datasets = []
self.responses: List[List[str]] = []
self.questions: List[str] = []
for model_abbr in self.model_abbrs:
filename = f'output_model/{model_abbr}/{dataset_abbr}.json'
original_datasets.append(mmengine.load(filename))
for key in original_datasets[-1].keys():
self.questions.append(original_datasets[-1][key]['origin_prompt'])
responses = []
for i in range(len(self.model_abbrs)):
responses.append(original_datasets[i][key]['prediction'])
self.responses.append(responses)
def _evaluate_dataset(self, dataset_abbr: str):
self._load_dataset(dataset_abbr=dataset_abbr)
model_scores = defaultdict(int)
for question, responses in zip(self.questions, self.responses):
prompt = self._make_prompt(question, responses)
print(prompt)
output = self.judger.generate(prompt,
max_out_len=2 *
len(self.model_abbrs))
model_scores = self._rank_models(output, model_scores)
return model_scores
def _make_prompt(self, question: str, responses: List[str]) -> str:
prompt = ('Below are a question and a set of answers, each numbered by'
' a digit. Please sort the answers from least to most '
'appropriate to the question. Only return the digits '
'separated by a blank space. For example, when there are '
'three answers presented, you should say "1 0 2" when the '
'second answer is the worst and the third is the best.\n'
f'Q: {question}\n')
for i, response in enumerate(responses):
prompt += f'A{i}: {response}\n'
return prompt
def _rank_models(self, output: str,
model_scores: defaultdict) -> Dict[str, int]:
"""Returns model ranking."""
output = output.strip().split(' ')
for score, model_idx in enumerate(output):
model_scores[model_idx] += int(score)
return model_scores
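# Illustrative sketch only, not part of this commit: shows how a judger reply
# such as "1 0 2" is folded into the running scores by `_rank_models`. The
# reply lists model indices from least to most appropriate, so the model
# listed last collects the highest score. `self` is unused by the method, so
# None is passed purely for this demonstration.
if __name__ == '__main__':
    demo_scores = ModelEvaluator._rank_models(None, '1 0 2', defaultdict(int))
    print(dict(demo_scores))  # {'1': 0, '0': 1, '2': 2}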
import argparse
import os.path as osp
import time
from typing import Optional
import mmengine
from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist
from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask
from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path,
get_logger, task_abbr_from_cfg)
@TASKS.register_module(force=(__name__ == '__main__')) # A hack for script run
class OpenICLEvalTask(BaseTask):
"""OpenICL Evaluation Task.
This task is used to evaluate the metric between predictions and
references.
"""
name_prefix = 'OpenICLEval'
log_subdir = 'logs/eval'
output_subdir = 'results'
def __init__(self, cfg: ConfigDict):
super().__init__(cfg)
self.num_gpus = 0
self.logger = get_logger()
def get_command_template(self):
return 'python3 {SCRIPT_PATH} {CFG_PATH}'
def run(self):
for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
for dataset_cfg in dataset_cfgs:
self.model_cfg = model_cfg
self.dataset_cfg = dataset_cfg
# Load Dataset
self.eval_cfg = self.dataset_cfg.get('eval_cfg')
self.output_column = dataset_cfg['reader_cfg']['output_column']
out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
if osp.exists(out_path):
continue
self._score()
def _score(self):
test_set = build_dataset_from_cfg(self.dataset_cfg).test
# Postprocess dataset if necessary
if 'dataset_postprocessor' in self.eval_cfg:
proc = TEXT_POSTPROCESSORS.get(
    self.eval_cfg['dataset_postprocessor']['type'])
def postprocess(sample):
s = sample[self.output_column]
sample[self.output_column] = proc(s)
return sample
test_set = test_set.map(postprocess)
# Load predictions
filename = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'predictions'))
# in case the prediction is partial
root, ext = osp.splitext(filename)
partial_filename = root + '_0' + ext
if not osp.exists(osp.realpath(filename)) and not osp.exists(
osp.realpath(partial_filename)):
result = {'error': 'No predictions found.'}
else:
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
pred_strs = [
preds[str(i)]['prediction'] for i in range(len(preds))
]
else:
filename = partial_filename
pred_strs = []
i = 1
while osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
filename = root + f'_{i}' + ext
i += 1
pred_strs += [
preds[str(i)]['prediction'] for i in range(len(preds))
]
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(self.model_cfg['meta_template'])
role = parser.roles[self.eval_cfg['pred_role']]
pred_strs = [
self._extract_role_pred(pred, role.get('begin', None),
role.get('end', None))
for pred in pred_strs
]
# Postprocess predictions if necessary
if 'pred_postprocessor' in self.eval_cfg:
proc = TEXT_POSTPROCESSORS.get(
self.eval_cfg['pred_postprocessor']['type'])
pred_strs = [proc(s) for s in pred_strs]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
result = icl_evaluator.score(
predictions=pred_strs, references=test_set[self.output_column])
if 'error' in result:
self.logger.error(
f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
return
# Save result
out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
mkdir_or_exist(osp.split(out_path)[0])
mmengine.dump(result, out_path)
def _extract_role_pred(self, s: str, begin_str: Optional[str],
end_str: Optional[str]) -> str:
"""Extract the role prediction from the full prediction string. The
role prediction may be the substring between the begin and end string.
Args:
s (str): Full prediction string.
begin_str (str): The beginning string of the role.
end_str (str): The ending string of the role.
Returns:
str: The extracted role prediction.
"""
start = 0
end = len(s)
if begin_str:
begin_idx = s.find(begin_str)
if begin_idx != -1:
start = begin_idx + len(begin_str)
if end_str:
# TODO: Support calling tokenizer for the accurate eos token
# and avoid such hardcode
end_idx = s.find(end_str[:1], start)
if end_idx != -1:
end = end_idx
return s[start:end]
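def _role_pred_demo() -> str:
    """Illustrative sketch only, not part of this commit.

    Shows what ``OpenICLEvalTask._extract_role_pred`` returns for an assumed
    BOT-style meta template whose ``begin``/``end`` markers wrap the answer
    inside the raw prediction string. The markers used here are invented for
    demonstration.
    """
    task = OpenICLEvalTask.__new__(OpenICLEvalTask)  # skip __init__ for the demo
    return task._extract_role_pred('<BOT>: 42<eoa>\n', '<BOT>: ', '<eoa>')  # '42'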
def parse_args():
parser = argparse.ArgumentParser(description='Score Calculator')
parser.add_argument('config', help='Config file path')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
cfg = Config.fromfile(args.config)
start_time = time.time()
inferencer = OpenICLEvalTask(cfg)
inferencer.run()
end_time = time.time()
get_logger().info(f'time elapsed: {end_time - start_time:.2f}s')
import os.path as osp
from typing import Dict
from mmengine.config import ConfigDict
def model_abbr_from_cfg(cfg: ConfigDict) -> str:
"""Generate model abbreviation from the model's confg."""
if 'abbr' in cfg:
return cfg['abbr']
model_abbr = cfg['type'] + '_' + '_'.join(
osp.realpath(cfg['path']).split('/')[-2:])
model_abbr = model_abbr.replace('/', '_')
return model_abbr
def dataset_abbr_from_cfg(cfg: ConfigDict) -> str:
"""Returns dataset abbreviation from the dataset's confg."""
if 'abbr' in cfg:
return cfg['abbr']
dataset_abbr = cfg['path']
if 'name' in cfg:
dataset_abbr += '_' + cfg['name']
dataset_abbr = dataset_abbr.replace('/', '_')
return dataset_abbr
def task_abbr_from_cfg(task: Dict) -> str:
"""Returns task abbreviation from the task's confg."""
return '[' + ','.join([
f'{model_abbr_from_cfg(model)}/'
f'{dataset_abbr_from_cfg(dataset)}'
for i, model in enumerate(task['models'])
for dataset in task['datasets'][i]
]) + ']'
def get_infer_output_path(model_cfg: ConfigDict,
dataset_cfg: ConfigDict,
root_path: str = None,
file_extension: str = 'json') -> str:
# TODO: Rename this func
assert root_path is not None, 'default root_path is not allowed any more'
model_abbr = model_abbr_from_cfg(model_cfg)
dataset_abbr = dataset_abbr_from_cfg(dataset_cfg)
return osp.join(root_path, model_abbr, f'{dataset_abbr}.{file_extension}')
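# Illustrative sketch only, not part of this commit: the configs below are
# assumptions showing how the abbreviation helpers compose into an inference
# output path.
if __name__ == '__main__':
    model_cfg = ConfigDict(abbr='llama-7b', type='HuggingFace', path='llama/7b')
    dataset_cfg = ConfigDict(abbr='gsm8k', path='gsm8k', name='main')
    print(model_abbr_from_cfg(model_cfg))      # llama-7b
    print(dataset_abbr_from_cfg(dataset_cfg))  # gsm8k
    print(get_infer_output_path(model_cfg, dataset_cfg, 'outputs/results'))
    # outputs/results/llama-7b/gsm8k.json
    print(task_abbr_from_cfg({'models': [model_cfg],
                              'datasets': [[dataset_cfg]]}))
    # [llama-7b/gsm8k]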
import subprocess
def get_git_root() -> str:
cmd = ['git', 'rev-parse', '--show-toplevel']
result = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
return result.stdout.decode('utf-8').strip()
def get_latest_commit(branch: str) -> str:
cmd = ['git', 'rev-parse', branch]
result = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
return result.stdout.decode('utf-8').strip()
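# Illustrative usage sketch only, not part of this commit: both helpers shell
# out to git and return the trimmed stdout, and raise CalledProcessError when
# the command fails (for example, when run outside a git checkout).
if __name__ == '__main__':
    print(get_git_root())             # absolute path of the repository root
    print(get_latest_commit('HEAD'))  # full hash of the current HEAD commit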
accelerate>=0.19.0
datasets>=2.12.0
evaluate>=0.3.0
faiss_gpu==1.7.2
nltk==3.8
numpy==1.23.4
openai==0.27.1
rank_bm25==0.2.2
requests==2.28.1
scikit_learn==1.2.1
sentence_transformers==2.2.2
torch>=1.13.1
tqdm==4.64.1
transformers>=4.29.1
openai
mmengine
jieba
pandas<2.0.0
cpm_kernels
tokenizers>=0.13.3
tabulate
fairscale
colossalai
boto3
tiktoken
from setuptools import find_packages, setup
from setuptools.command.install import install
class DownloadNLTK(install):
def run(self):
self.do_egg_install()
import nltk
nltk.download('punkt')
with open('README.md') as f:
readme = f.read()
def do_setup():
setup(
name='opencompass',
version='0.5.0',
description='A comprehensive toolkit for large model evaluation',
# url="",
# author="",
long_description=readme,
long_description_content_type='text/markdown',
cmdclass={'download_nltk': DownloadNLTK},
setup_requires=['nltk==3.8'],
python_requires='>=3.8.0',
packages=find_packages(exclude=[
'test*',
'paper_test*',
]),
keywords=['AI', 'NLP', 'in-context learning'],
classifiers=[
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Intended Audience :: Developers',
'Intended Audience :: Education',
'Intended Audience :: Science/Research',
])
if __name__ == '__main__':
do_setup()
import unittest
from opencompass.utils.prompt import PromptList
class TestPromptList(unittest.TestCase):
def test_initialization(self):
pl = PromptList()
self.assertEqual(pl, [])
pl = PromptList(['test', '123'])
self.assertEqual(pl, ['test', '123'])
def test_format(self):
pl = PromptList(['hi {a}{b}', {'prompt': 'hey {a}!'}, '123'])
new_pl = pl.format(a=1, b=2, c=3)
self.assertEqual(new_pl, ['hi 12', {'prompt': 'hey 1!'}, '123'])
new_pl = pl.format(b=2)
self.assertEqual(new_pl, ['hi {a}2', {'prompt': 'hey {a}!'}, '123'])
new_pl = pl.format(d=1)
self.assertEqual(new_pl, ['hi {a}{b}', {'prompt': 'hey {a}!'}, '123'])
def test_replace(self):
pl = PromptList(['hello world', {'prompt': 'hello world'}, '123'])
new_pl = pl.replace('world', 'there')
self.assertEqual(new_pl,
['hello there', {
'prompt': 'hello there'
}, '123'])
new_pl = pl.replace('123', PromptList(['p', {'role': 'BOT'}]))
self.assertEqual(
new_pl,
['hello world', {
'prompt': 'hello world'
}, 'p', {
'role': 'BOT'
}])
new_pl = pl.replace('2', PromptList(['p', {'role': 'BOT'}]))
self.assertEqual(new_pl, [
'hello world', {
'prompt': 'hello world'
}, '1', 'p', {
'role': 'BOT'
}, '3'
])
with self.assertRaises(TypeError):
new_pl = pl.replace('world', PromptList(['new', 'world']))
def test_add(self):
pl = PromptList(['hello'])
new_pl = pl + ' world'
self.assertEqual(new_pl, ['hello', ' world'])
pl2 = PromptList([' world'])
new_pl = pl + pl2
self.assertEqual(new_pl, ['hello', ' world'])
new_pl = 'hi, ' + pl
self.assertEqual(new_pl, ['hi, ', 'hello'])
pl += '!'
self.assertEqual(pl, ['hello', '!'])
def test_str(self):
pl = PromptList(['hello', ' world', {'prompt': '!'}])
self.assertEqual(str(pl), 'hello world!')
with self.assertRaises(TypeError):
pl = PromptList(['hello', ' world', 123])
str(pl)