Unverified commit 8c85edd1, authored by Fengzhe Zhou, committed by GitHub

[Sync] deprecate old mbpps (#1064)

parent c1724013
@@ -4,7 +4,7 @@
 ## pass@1
-If you only need a single generated response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), following the general [quick start tutorial](../get_started/quick_start.md).
+If you only need a single generated response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), following the general [quick start tutorial](../get_started/quick_start.md).
 For multilingual code evaluation, refer to the [multilingual code evaluation tutorial](./code_eval_service.md).
@@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
 with read_base():
     from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

 mbpp_datasets[0]['type'] = MBPPDataset_V2
 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
@@ -64,7 +64,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
 with read_base():
     from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

 humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
 humaneval_datasets[0]['num_repeats'] = 10
......
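Putting the documentation snippets above together, a downstream pass@k config only needs its import switched to the renamed `deprecated_` file. A minimal sketch, not part of the diff; the final `datasets` assembly line is illustrative:

from mmengine.config import read_base

from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator

with read_base():
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    # the old module `mbpp_gen_1e1056` now lives under the `deprecated_` prefix
    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

# switch the MBPP entry to the pass@k dataset/evaluator pair
mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator

datasets = humaneval_datasets + mbpp_datasets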
@@ -56,6 +56,12 @@ def parse_args():
                         'to run',
                         action='store_true',
                         default=False)
+    parser.add_argument(
+        '--accelerator',
+        help='Infer accelerator, support vllm and lmdeploy now.',
+        choices=['vllm', 'lmdeploy', 'hg'],
+        default='hg',
+        type=str)
     parser.add_argument('-m',
                         '--mode',
                         help='Running mode. You can choose "infer" if you '
......
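A quick look at how the new flag behaves in isolation (this snippet is not part of the diff; everything except --accelerator is illustrative):

import argparse

parser = argparse.ArgumentParser(description='accelerator flag demo')
parser.add_argument('--accelerator',
                    help='Infer accelerator, support vllm and lmdeploy now.',
                    choices=['vllm', 'lmdeploy', 'hg'],
                    default='hg',
                    type=str)

args = parser.parse_args(['--accelerator', 'lmdeploy'])
# 'hg' keeps the plain HuggingFace backend; 'vllm' / 'lmdeploy' trigger the
# change_accelerator() conversion added later in this commit.
assert args.accelerator == 'lmdeploy'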
@@ -27,11 +27,9 @@ except ImportError:
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-from opencompass.utils.logging import get_logger

 from .base import BaseDataset

-logger = get_logger()

 TIMEOUT = 10
@@ -321,7 +319,7 @@ def timeout_handler(signum, frame):
 try:
     signal.signal(signal.SIGALRM, timeout_handler)
 except AttributeError:
-    logger.warning('signal.SIGALRM is not available on this platform')
+    print('signal.SIGALRM is not available on this platform')
 timeout = 4  # seconds
......
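The SIGALRM registration guarded above only exists on Unix-like platforms; the AttributeError branch covers systems such as Windows. A standalone sketch of the same timeout pattern, not taken from the diff (run_with_timeout is illustrative):

import signal

TIMEOUT = 10


def timeout_handler(signum, frame):
    raise TimeoutError('execution timed out')


try:
    signal.signal(signal.SIGALRM, timeout_handler)
    have_sigalrm = True
except AttributeError:
    print('signal.SIGALRM is not available on this platform')
    have_sigalrm = False


def run_with_timeout(fn, seconds=TIMEOUT):
    if not have_sigalrm:
        return fn()  # no hard timeout available on this platform
    signal.alarm(seconds)
    try:
        return fn()
    finally:
        signal.alarm(0)  # always clear the pending alarm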
@@ -134,11 +134,20 @@ class MBPPPlusDataset(BaseDataset):
             multiple responses in special cases.
         """

+        def processing_test(example):
+            example['test_case'] = example['test_list']
+            example['test_list'] = '\n'.join(example['test_list'])
+            example['test_list_2'] = example['test_list']
+            example['test_column'] = dict(test_list_2=example['test_list'],
+                                          task_id=example['task_id'])
+            return example
+
         dataset = []
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
-                dataset.extend(
-                    [json.loads(line.strip()) for _ in range(num_repeats)])
+                example = json.loads(line.strip())
+                example = processing_test(example)
+                dataset.extend([example for _ in range(num_repeats)])
         return Dataset.from_list(dataset)
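What processing_test does to one JSONL record, shown outside the diff (field values are made up; only the key layout follows the code above):

example = {
    'task_id': 2,
    'prompt': 'Write a function to find the shared elements from two lists.',
    'test_list': [
        'assert shared([3, 4, 5], [5, 7, 4]) == {4, 5}',
        'assert shared([1, 2, 3], [5, 4, 3]) == {3}',
    ],
}

example['test_case'] = example['test_list']             # keep the raw list of asserts
example['test_list'] = '\n'.join(example['test_list'])  # newline-joined string for prompting
example['test_list_2'] = example['test_list']
example['test_column'] = dict(test_list_2=example['test_list'],
                              task_id=example['task_id'])

print(example['test_case'][0])            # still a list entry
print(example['test_column']['task_id'])  # 2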
@@ -211,7 +220,7 @@ class MBPPEvaluator(BaseEvaluator):
                     predictions)):
                 pred = self._process_answer(pred)
                 programs = self._process_test(refer, pred)
-                future = executor.submit(execution, programs, i, 3)
+                future = executor.submit(execution, programs, i, 10)
                 futures.append(future)
                 details[str(i)] = {}
                 details[str(i)]['origin'] = predictions[i]
@@ -262,39 +271,34 @@ class MBPPEvaluator(BaseEvaluator):
         return {f'mbpp_plus_{k}': score[k] * 100 for k in score}

     def _process_answer(self, text):
-        try:
-            # for chatGLM related text
-            eval_text = eval(text)
-        except Exception:
-            pass
-        else:
-            if isinstance(eval_text, str):
-                text = eval_text
-        # deal with code block
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```xxx
-                    text = text[max(text.find('\n') + 1, 0):]
-        text = text.strip()
-        match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
-        if match:
-            text = text[:match.start()]
-        match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
-        if match:
-            text = text[match.end():]
+        patterns = [
+            r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",
+            r"BEGIN\s*'(.*)'\s*\[DONE\]",
+            r"\[BEGIN\]\s*'(.*)'\s*DONE",
+            r"BEGIN\s*'(.*)'\s*DONE",
+            r"\[BEGIN\]\s*'(.*)\s*\[DONE\]",
+            r"BEGIN\s*'(.*)\s*\[DONE\]",
+            r"\[BEGIN\]\s*'(.*)\s*DONE",
+            r"BEGIN\s*'(.*)\s*DONE",
+            r'\[BEGIN\]\s*(.*)\s*\[DONE\]',
+            r'BEGIN\s*(.*)\s*\[DONE\]',
+            r'\[BEGIN\]\s*(.*)\s*DONE',
+            r'BEGIN\s*(.*)\s*DONE',
+            r'```python\s*(.*)\s*```',
+            r'```\s*(.*)\s*```',
+            r'(.*)\s*```.*',
+            r"\[BEGIN\]\s*'(.*)",
+            r'\[BEGIN\](.*)',
+        ]
+        for p in patterns:
+            match = re.search(p, text, re.DOTALL)
+            if match:
+                text = match.group(1)
+                break
+        text = text.split('```')[0]
+        text = re.split(r"'?\s*\[?DONE\]?", text)[0]
+        text = text.replace('\\_', '_')
         text = text.strip()
-        if text.startswith("'"):
-            text = text[1:]
-        if text.endswith("'"):
-            text = text[:-1]
-        text = text.replace('\\', '')
-        match = re.search(r'```python(.*)```', text, re.DOTALL)
-        if match:
-            text = match.group(1).strip().split('```')[0].strip()
         return text
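A quick standalone check (not part of the diff) of how the pattern list strips the [BEGIN]/[DONE] wrapper; only a subset of the patterns is reproduced and the response string is invented:

import re

patterns = [
    r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",
    r'```python\s*(.*)\s*```',
    r'\[BEGIN\](.*)',
]

response = "[BEGIN] 'def add(a, b):\n    return a + b' [DONE]"
text = response
for p in patterns:
    match = re.search(p, text, re.DOTALL)
    if match:
        text = match.group(1)
        break
text = text.split('```')[0]
text = re.split(r"'?\s*\[?DONE\]?", text)[0]
text = text.replace('\\_', '_').strip()
print(text)  # prints the two-line function body, wrapper removed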
     def _process_test(self, test_case, pred):
@@ -451,7 +455,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
                 for pred in preds:
                     pred = self._process_answer(pred)
                     programs = self._process_test(test_case, pred)
-                    future = executor.submit(execution, programs, task_id, 3)
+                    future = executor.submit(execution, programs, task_id, 10)
                     futures.append(future)

         from tqdm import tqdm
......
@@ -27,11 +27,9 @@ except ImportError:
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
-from opencompass.utils.logging import get_logger

 from .base import BaseDataset

-logger = get_logger()

 TIMEOUT = 10
@@ -267,7 +265,7 @@ def timeout_handler(signum, frame):
 try:
     signal.signal(signal.SIGALRM, timeout_handler)
 except AttributeError:
-    logger.warning('signal.SIGALRM is not available on this platform')
+    print('signal.SIGALRM is not available on this platform')
 timeout = 4  # seconds
......
@@ -84,7 +84,12 @@ class OpenAI(BaseAPIModel):
         self.top_logprobs = top_logprobs

         if isinstance(key, str):
-            self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key]
+            if key == 'ENV':
+                if 'OPENAI_API_KEY' not in os.environ:
+                    raise ValueError('OpenAI API key is not set.')
+                self.keys = os.getenv('OPENAI_API_KEY').split(',')
+            else:
+                self.keys = [key]
         else:
             self.keys = key
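Behavioural sketch of the new key handling (not part of the diff): with key='ENV', OPENAI_API_KEY can now hold several comma-separated keys, and a missing variable fails fast instead of silently producing a None key.

import os

os.environ['OPENAI_API_KEY'] = 'sk-aaa,sk-bbb,sk-ccc'  # dummy values

key = 'ENV'
if isinstance(key, str):
    if key == 'ENV':
        if 'OPENAI_API_KEY' not in os.environ:
            raise ValueError('OpenAI API key is not set.')
        keys = os.getenv('OPENAI_API_KEY').split(',')
    else:
        keys = [key]
else:
    keys = key

print(keys)  # ['sk-aaa', 'sk-bbb', 'sk-ccc']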
@@ -101,12 +106,11 @@ class OpenAI(BaseAPIModel):
         self.url = openai_api_base
         self.path = path

-    def generate(
-        self,
-        inputs: List[PromptType],
-        max_out_len: int = 512,
-        temperature: float = 0.7,
-    ) -> List[str]:
+    def generate(self,
+                 inputs: List[PromptType],
+                 max_out_len: int = 512,
+                 temperature: float = 0.7,
+                 **kwargs) -> List[str]:
         """Generate results given a list of inputs.

         Args:
@@ -412,9 +416,15 @@ class OpenAIAllesAPIN(OpenAI):
         }
         for _ in range(self.retry):
             self.wait()
-            raw_response = requests.post(self.url,
-                                         headers=self.headers,
-                                         data=json.dumps(data))
+            try:
+                raw_response = requests.post(self.url,
+                                             headers=self.headers,
+                                             data=json.dumps(data))
+            except requests.ConnectionError:
+                self.logger.error('Request error, got',
+                                  str(raw_response.content))
+                time.sleep(1)
+                continue
             try:
                 response = raw_response.json()
             except requests.JSONDecodeError:
......
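A self-contained sketch of the retry-on-ConnectionError pattern added above, not taken from the diff (in the class, the logger and wait() come from BaseAPIModel; here plain print and a helper function stand in):

import json
import time

import requests


def post_with_retry(url, headers, data, retry=2):
    for _ in range(retry):
        try:
            raw_response = requests.post(url,
                                         headers=headers,
                                         data=json.dumps(data))
        except requests.ConnectionError as err:
            print('Request error:', err)
            time.sleep(1)
            continue
        try:
            return raw_response.json()
        except requests.JSONDecodeError:
            print('JsonDecode error, got', raw_response.content)
            time.sleep(1)
    return None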
@@ -161,7 +161,7 @@ class Qwen(BaseAPIModel):
                 time.sleep(1)
                 continue
             if response.status_code == 429:
-                print('Rate limited')
+                print(response)
                 time.sleep(2)
                 continue
             if response.status_code == 400:
......
@@ -214,6 +214,16 @@ class DLCRunner(BaseRunner):
         pod_create_time = None
         pri_time = None
         initial_time = datetime.datetime.now()
+
+        url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com'  # noqa: E501
+        logger = get_logger()
+        logger.debug('')
+        logger.debug('*' * 168)
+        logger.debug(
+            f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail'  # noqa: E501
+        )
+        logger.debug('*' * 168)
+
         while True:
             # 1. Avoid to request dlc too frequently.
             # 2. DLC job may not be ready immediately after creation.
......
@@ -188,6 +188,7 @@ class SlurmSequentialRunner(BaseRunner):
             tmpl += f' --gres=gpu:{num_gpus}'
         for extra_cmd in self.extra_command:
             tmpl += f' {extra_cmd}'
+        tmpl += ' -x HOST-10-140-60-7'
         tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'
         get_cmd = partial(task.get_command,
                           cfg_path=param_file,
......
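Illustration of how the command template string grows, outside the diff (the base srun command and extra_command values are invented; only the '-x' node-exclusion flag is what the commit adds):

num_gpus = 8
task_name = 'demo-task'
extra_command = ['--async']

tmpl = 'srun -p my_partition'          # illustrative base command
if num_gpus > 0:
    tmpl += f' --gres=gpu:{num_gpus}'
for extra_cmd in extra_command:
    tmpl += f' {extra_cmd}'
tmpl += ' -x HOST-10-140-60-7'         # exclude one specific node
tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'

print(tmpl.format(task_cmd='python run.py config.py'))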
@@ -72,7 +72,7 @@ dataset_mapping_dict = {}
 needle_counts = ['2', '3', '4', '5']
 languages = ['en', 'zh']
-sizes = ['4k', '8k', '32k', '200k', '1000k']
+sizes = ['4k', '8k', '32k', '200k', '256k', '1000k']
 types = ['origin', 'parallel']

 for needle_count in needle_counts:
@@ -190,7 +190,7 @@ def save_results_to_plots(txt_results_save_path):
     numbers = [2, 3, 4, 5]
     languages = ['en', 'zh']
     size_exists = []
-    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']
+    sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_256k', '_1000k']

     for size in sizes_origin:
         if size in content:
@@ -301,6 +301,9 @@ def visualize(df_raw, save_path: str, model_name: str, dataset_type: str):
         markersize=8,
         label='Average Depth Score'
     )
+    for x_value, y_value in zip(x_data, y_data):
+        ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top')
+
     ax2.set_ylim(0, 100)
     ax2.set_yticklabels([])
@@ -353,7 +356,7 @@ def visualize(df_raw, save_path: str, model_name: str, dataset_type: str):
     new_save_path = os.path.join(directory_path, new_filename)
     plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
-    print(f'Saved :{new_save_path}')
+    print(f'Saved: {new_save_path}')
     plt.close()
......
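Minimal reproduction of the new per-point labels on the depth-score line, not part of the diff (data values are invented):

import matplotlib
matplotlib.use('Agg')  # no display needed
import matplotlib.pyplot as plt

x_data = ['4k', '8k', '32k', '200k', '256k', '1000k']
y_data = [92.1, 88.4, 75.0, 61.3, 58.9, 40.2]

fig, ax2 = plt.subplots()
ax2.plot(x_data, y_data, marker='o', markersize=8, label='Average Depth Score')
for x_value, y_value in zip(x_data, y_data):
    ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top')
ax2.set_ylim(0, 100)
plt.savefig('needlebench_demo.png', format='png', bbox_inches='tight', pad_inches=0)
print('Saved: needlebench_demo.png')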
-# flake8: noqa: E501
-import ast
-import csv
+# flake8: noqa
+# yapf: disable
 import os
 import os.path as osp
 import re
@@ -10,7 +9,7 @@ from itertools import product
 import mmengine
 from mmengine import ConfigDict
-from prettytable import from_csv
+from tabulate import tabulate

 from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
@@ -18,6 +17,12 @@ from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 from .utils import get_judgeanswer_and_reference, get_outdir


+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
+
 def post_process_compass_arena(s):
     if result := re.findall('(?:选择:|Choice: )([ABC])', s):
         return result[0]
@@ -68,17 +73,90 @@ class CompassArenaSummarizer:
         self.base_models = self.cfg['eval']['partitioner']['base_models']
         self.compare_models = self.cfg['eval']['partitioner']['compare_models']
         self.judge_models = self.cfg.get('judge_models', None)
-        self.meta_judge_model = self.cfg.eval.partitioner.get(
-            'meta_judge_model', None)
+        self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
         self.judge_type = judge_type
         assert self.judge_type in ['general']
-        self.judge_map = {
-            'general': post_process_compass_arena,
-        }
+        self.judge_map = {'general': post_process_compass_arena}
         self.judge_function = self.judge_map[self.judge_type]
         self.check_pos_bias = check_pos_bias
         self.summary_type = summary_type

+    def get_score(self, time_str):
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        model_combinations = list(product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
+
+        if self.meta_judge_model is not None:
+            self.judge_models.append(self.meta_judge_model)
+
+        scores = {}
+
+        for idx, judge_model_cfg in enumerate(self.judge_models):
+            judge_model = model_abbr_from_cfg(judge_model_cfg)
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                for model_pair in unique_combinations:
+                    model1 = model_pair[0]['abbr']
+                    model2 = model_pair[1]['abbr']
+                    if idx == len(self.judge_models):
+                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
+                    else:
+                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+                    subdir_path = os.path.join(results_folder, subdir)
+                    if not os.path.isdir(subdir_path):
+                        print(subdir_path + ' is not exist! please check!')
+                        continue
+                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
+                    if self.check_pos_bias:
+                        bias_num = check_position_bias(judged_answers, references)
+                    else:
+                        bias_num = 0
+                    win_model1 = defaultdict(float)
+                    win_model2 = defaultdict(float)
+                    categories = defaultdict(float)
+                    model1 = references[0]['answer1']
+                    model2 = references[0]['answer2']
+                    for prediction, reference in zip(judged_answers, references):
+                        categories[dataset_abbr] += 1
+                        categories[reference['capability']] += 1
+
+                        if prediction == 'A':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 1, 0
+                            else:
+                                score_1, score_2 = 0, 1
+                        elif prediction == 'B':
+                            if reference['answer1'] == model1:
+                                score_1, score_2 = 0, 1
+                            else:
+                                score_1, score_2 = 1, 0
+                        elif prediction == 'C':
+                            if self.summary_type == 'half_add':
+                                score_1, score_2 = 0.5, 0.5
+                            else:
+                                score_1, score_2 = 0, 0
+
+                        win_model1[reference['capability']] += score_1
+                        win_model1[dataset_abbr] += score_1
+                        win_model2[reference['capability']] += score_2
+                        win_model2[dataset_abbr] += score_2
+                    for capability in categories:
+                        win_model1[capability] = win_model1[capability] / categories[capability] * 100
+                        win_model1[capability] = round(win_model1[capability], 2)
+                        win_model2[capability] = win_model2[capability] / categories[capability] * 100
+                        win_model2[capability] = round(win_model2[capability], 2)
+
+                    win_model1['position_bias'] = bias_num
+                    win_model2['position_bias'] = bias_num
+
+                    if judge_model not in scores:
+                        scores[judge_model] = {}
+                    if dataset_abbr not in scores[judge_model]:
+                        scores[judge_model][dataset_abbr] = {}
+                    scores[judge_model][dataset_abbr][model2] = win_model2
+
+        return scores
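Shape of the dict returned by get_score, sketched outside the diff (judge, dataset, model names and numbers are invented):

scores = {
    'gpt-4-judge': {
        'compass_arena': {
            'model-b': {
                'compass_arena': 55.00,   # overall win rate vs the base model
                'position_bias': 3,       # swapped-order disagreements
                'creation': 60.00,        # per-capability win rates...
                'reasoning': 48.50,
            },
        },
    },
}

# summarize() below turns each innermost dict into one table column.
print(scores['gpt-4-judge']['compass_arena']['model-b']['creation'])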
     def summarize(
             self,
             time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
@@ -91,143 +169,72 @@ class CompassArenaSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
-        dataset_cfgs = self.cfg['datasets']
+        scores = self.get_score(time_str)
+        # scores['win_' + model1] = win_model1
         output_dir, results_folder = get_outdir(self.cfg, time_str)
-        model_combinations = list(
-            product(self.base_models, self.compare_models))
-        unique_combinations = remove_duplicate_pairs(
-            [combo for combo in model_combinations if combo[0] != combo[1]])
-        fout_list = []
-        pre_len = len(self.judge_models)
-        if self.meta_judge_model is not None:
-            self.judge_models.append(self.meta_judge_model)
-            meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model)
-        else:
-            meta_judge_model_abbr = None
+
         for idx, judge_model in enumerate(self.judge_models):
-            judge_model = model_abbr_from_cfg(judge_model)
-            for dataset in dataset_cfgs:
+            judge_abbr = model_abbr_from_cfg(judge_model)
+            for dataset in self.cfg['datasets']:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
-                if idx == pre_len:
-                    fout = osp.join(
-                        output_dir, 'summarized-by--' + judge_model + '-' +
-                        dataset_abbr + '-report.csv')
-                else:
-                    fout = osp.join(
-                        output_dir, 'judged-by--' + judge_model + '-' +
-                        dataset_abbr + '-report.csv')
-                fout_list.append(fout)
-                for model_pair in unique_combinations:
-                    model1, model2, = model_pair[0]['abbr'], model_pair[1][
-                        'abbr'],
-                    if idx == pre_len:
-                        subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
-                    else:
-                        subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
-                    subdir_path = os.path.join(results_folder, subdir)
-                    if os.path.isdir(subdir_path):
-                        judged_answers, references = get_judgeanswer_and_reference(
-                            dataset,
-                            subdir_path,
-                            self.judge_function,
-                        )
-                        if self.check_pos_bias:
-                            bias_num = check_position_bias(
-                                judged_answers, references)
-                        else:
-                            bias_num = 0
-                        win_model1, win_model2, categories = defaultdict(
-                            float), defaultdict(float), defaultdict(float)
-                        model1, model2 = references[0]['answer1'], references[
-                            0]['answer2']
-                        for prediction, reference in zip(
-                                judged_answers, references):
-                            if self.summary_type == 'single':
-                                if prediction == 'A':
-                                    categories['total'] += 1
-                                    categories[reference['capability']] += 1
-                                    if reference['answer1'] == model1:
-                                        win_model1[
-                                            reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                    else:
-                                        win_model2[
-                                            reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                elif prediction == 'B':
-                                    categories['total'] += 1
-                                    categories[reference['capability']] += 1
-                                    if reference['answer1'] == model1:
-                                        win_model2[
-                                            reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                    else:
-                                        win_model1[
-                                            reference['capability']] += 1
-                                        win_model1['total'] += 1
-                            elif self.summary_type == 'half_add':
-                                categories['total'] += 1
-                                categories[reference['capability']] += 1
-                                if prediction == 'A':
-                                    if reference['answer1'] == model1:
-                                        win_model1[
-                                            reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                    else:
-                                        win_model2[
-                                            reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                elif prediction == 'B':
-                                    if reference['answer1'] == model1:
-                                        win_model2[
-                                            reference['capability']] += 1
-                                        win_model2['total'] += 1
-                                    else:
-                                        win_model1[
-                                            reference['capability']] += 1
-                                        win_model1['total'] += 1
-                                elif prediction == 'C':
-                                    win_model1[reference['capability']] += 0.5
-                                    win_model1['total'] += 0.5
-                                    win_model2[reference['capability']] += 0.5
-                                    win_model2['total'] += 0.5
-                        for capability in categories:
-                            if capability not in win_model1:
-                                win_model1[capability] = 0.0
-                            else:
-                                win_model1[capability] = round(
-                                    (win_model1[capability] /
-                                     categories[capability]) * 100, 2)
-                            if capability not in win_model2:
-                                win_model2[capability] = 0.0
-                            else:
-                                win_model2[capability] = round(
-                                    (win_model2[capability] /
-                                     categories[capability]) * 100, 2)
-                        win_model1['position_bias'] = bias_num
-                        win_model2['position_bias'] = bias_num
-                        scores = {
-                            'win_' + model1: win_model1,
-                            'win_' + model2: win_model2
-                        }
-                        rows = list(scores.keys())
-                        columns = list(scores[rows[0]].keys())
-                        columns.insert(0, columns.pop(columns.index('total')))
-                        columns.insert(
-                            1, columns.pop(columns.index('position_bias')))
-                        with open(fout, 'a+', newline='') as csvfile:
-                            writer = csv.writer(csvfile)
-                            writer.writerow([model1 + '_vs_' + model2] +
-                                            columns)
-                            for row in rows:
-                                writer.writerow([row] + [
-                                    scores[row][column] for column in columns
-                                ])
-                    else:
-                        print(subdir_path + ' is not exist! please check!')
-        for fout in fout_list:
-            with open(fout, 'r') as f:
-                x = from_csv(f)
-            print(fout)
-            print(x)
+                summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
+                one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
+                row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']]
+                row_headers = [dataset_abbr, 'position_bias'] + row_headers
+                headers = [''] + summarizer_model_abbrs
+                table = []
+                for row_header in row_headers:
+                    row = [row_header]
+                    for model_cfg in self.compare_models:
+                        model_abbr = model_abbr_from_cfg(model_cfg)
+                        s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
+                        if isinstance(s, float):
+                            s = f'{s:.2f}'
+                        if isinstance(s, int):
+                            s = str(s)
+                        row.append(s)
+                    table.append(row)
+                txt = tabulate(table, headers=headers)
+                print(txt)
+                if idx == len(self.judge_models):
+                    output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
+                else:
+                    output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
+                with open(output_filename, 'w') as f:
+                    f.write(','.join(headers) + '\n')
+                    for line in table:
+                        f.write(','.join(line) + '\n')
+                print(output_filename)
+
+            table = []
+            summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
+            headers = [''] + summarizer_model_abbrs
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                row = [dataset_abbr]
+                for model_cfg in self.compare_models:
+                    model_abbr = model_abbr_from_cfg(model_cfg)
+                    s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
+                    if isinstance(s, float):
+                        s = f'{s:.2f}'
+                    if isinstance(s, int):
+                        s = str(s)
+                    row.append(s)
+                table.append(row)
+            txt = tabulate(table, headers=headers)
+            print(txt)
+
+            if idx == len(self.judge_models):
+                output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
+            else:
+                output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
+            with open(output_filename, 'w') as f:
+                f.write(','.join(headers) + '\n')
+                for line in table:
+                    f.write(','.join(line) + '\n')
+            print(output_filename)
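A sketch of the new reporting path, outside the diff: build a plain list-of-lists table, pretty-print it with tabulate, and write the same rows as CSV by hand (the prettytable from_csv round-trip is gone). Headers and values are invented.

from tabulate import tabulate

headers = ['', 'model-b', 'model-c']
table = [
    ['compass_arena', '55.00', '47.25'],
    ['position_bias', '3', '5'],
    ['creation', '60.00', '44.00'],
]

print(tabulate(table, headers=headers))

with open('judged-by--gpt-4-judge-compass_arena-report.csv', 'w') as f:
    f.write(','.join(headers) + '\n')
    for line in table:
        f.write(','.join(line) + '\n')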
-# flake8: noqa: E501
+# flake8: noqa
+# yapf: disable
 import csv
 import os
 import os.path as osp
@@ -8,11 +9,7 @@ from datetime import datetime
 import numpy as np
 from mmengine import ConfigDict
+from tabulate import tabulate

-try:
-    from prettytable import from_csv
-except ImportError:
-    from_csv = None
 from opencompass.utils import model_abbr_from_cfg
@@ -20,6 +17,12 @@ from .compass_arena import CompassArenaSummarizer
 from .utils import get_judgeanswer_and_reference, get_outdir


+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
+
 def post_process_mtbench_pair(judgement: str):
     """Input a string like below:
@@ -52,7 +55,7 @@ def get_capability_results(
         references,
         fout,
         fout_flag,
-        model,
+        model_abbr,
 ):
     capability_ratings = defaultdict(int)
     capability_counts = defaultdict(int)
@@ -70,12 +73,12 @@ def get_capability_results(
         capability_avg_ratings[capability] = s
     columns = list(capability_avg_ratings.keys())
     columns.insert(0, columns.pop(columns.index('total')))

     with open(fout, 'a+', newline='') as csvfile:
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['model'] + columns)
-        writer.writerow([model] +
-                        [capability_avg_ratings[column] for column in columns])
+        writer.writerow([model_abbr] + [capability_avg_ratings[column] for column in columns])
 class MTBenchSummarizer(CompassArenaSummarizer):
@@ -92,13 +95,9 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         self.cfg = config
         if self.judge_type == 'single':
             self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
-            self.eval_model_abbrs = [
-                model_abbr_from_cfg(model) for model in self.eval_model_cfgs
-            ]
         elif self.judge_type == 'pair':
             self.base_models = self.cfg['eval']['partitioner']['base_models']
-            self.compare_models = self.cfg['eval']['partitioner'][
-                'compare_models']
+            self.compare_models = self.cfg['eval']['partitioner']['compare_models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_map = {
             'single': post_process_mtbench_single,
@@ -106,8 +105,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         }
         self.judge_function = self.judge_map[self.judge_type]

-    def summarize(self,
-                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+    def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.

         Args:
@@ -116,33 +114,40 @@ class MTBenchSummarizer(CompassArenaSummarizer):
         Returns:
             pd.DataFrame: The summary results.
         """
-        if self.judge_type == 'single':
-            dataset_cfgs = self.cfg['datasets']
-            output_dir, results_folder = get_outdir(self.cfg, time_str)
-            fout_flag = 0
-            for eval_model_abbr in self.eval_model_abbrs:
-                subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
-                subdir_path = os.path.join(results_folder, subdir)
-                if os.path.isdir(subdir_path):
-                    model, judge_model = eval_model_abbr, self.judge_abbr
-                    fout = osp.join(
-                        output_dir,
-                        'judged-by--' + judge_model + '-capability.csv')
-                    overall_judged_answers, overall_references = [], []
-                    for dataset in dataset_cfgs:
-                        judged_answers, references = get_judgeanswer_and_reference(
-                            dataset, subdir_path, self.judge_function)
-                        overall_judged_answers += judged_answers
-                        overall_references += references
-                    get_capability_results(overall_judged_answers,
-                                           overall_references, fout, fout_flag,
-                                           model)
-                    fout_flag += 1
-                else:
-                    print(subdir_path + ' is not exist! please check!')
-            with open(fout, 'r') as f:
-                x = from_csv(f)
-            print(x)
-            print(fout)
-        elif self.judge_type == 'pair':
-            super().summarize()
+        if self.judge_type == 'pair':
+            return super().summarize()
+
+        # self.judge_type == 'single'
+        dataset_cfgs = self.cfg['datasets']
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        fout_flag = 0
+        for eval_model_cfg in self.eval_model_cfgs:
+            eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
+            show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
+            subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
+            if os.path.isdir(subdir_path):
+                fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
+                overall_judged_answers, overall_references = [], []
+                for dataset in dataset_cfgs:
+                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
+                    overall_judged_answers += judged_answers
+                    overall_references += references
+                get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
+                fout_flag += 1
+            else:
+                print(subdir_path + ' is not exist! please check!')
+
+        with open(fout, 'r') as f:
+            csv_reader = csv.reader(f)
+            header = next(csv_reader)
+            table = [line for line in csv_reader]
+
+        new_header = [''] + [line[0] for line in table]
+        new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
+        new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
+        t = tabulate(new_table, headers=new_header)
+        with open(fout, 'w') as f:
+            f.write(','.join(new_header) + '\n')
+            for line in new_table:
+                f.write(','.join(map(str, line)) + '\n')
+        print(t)
+        print(fout)
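The capability CSV is written model-per-row but displayed capability-per-row; this standalone snippet (not part of the diff, rows invented) shows the transpose the code above performs:

from tabulate import tabulate

header = ['model', 'total', 'writing', 'reasoning']
table = [
    ['model-a', '7.91', '8.55', '6.30'],
    ['model-b', '7.12', '7.80', '5.95'],
]

new_header = [''] + [line[0] for line in table]           # '', model-a, model-b
new_table = [[h] + [line[i] for line in table]
             for i, h in enumerate(header[1:], start=1)]  # one row per capability
print(tabulate(new_table, headers=new_header))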
@@ -3,6 +3,7 @@ import copy
 import fnmatch
 import math
 import os.path as osp
+import re
 import statistics
 import time
 from collections import Counter
@@ -38,12 +39,12 @@ def extract_role_pred(s: str, begin_str: Optional[str],
     start = 0
     end = len(s)

-    if begin_str:
+    if begin_str and re.match(r'\s*', begin_str) is None:
         begin_idx = s.find(begin_str)
         if begin_idx != -1:
             start = begin_idx + len(begin_str)

-    if end_str:
+    if end_str and re.match(r'\s*', end_str) is None:
         # TODO: Support calling tokenizer for the accurate eos token
         # and avoid such hardcode
         end_idx = s.find(end_str, start)
......
 # flake8: noqa: E501
 import copy
 import json
+import os
 import os.path as osp

 import mmengine
@@ -123,6 +124,10 @@ class AlpacaEvalTask(BaseTask):
         command = ''
         if api_key is not None:
             command += f'export OPENAI_API_KEY={api_key}; '
+        else:
+            api_key = os.environ.get('OPENAI_API_KEY', '').split(',')[0]
+            if api_key:
+                command += f'export OPENAI_API_KEY={api_key}; '
         command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
         return template.format(task_cmd=command)
......
@@ -5,6 +5,7 @@ import tabulate
 from mmengine.config import Config

 from opencompass.datasets.custom import make_custom_dataset_config
+from opencompass.models import VLLM, HuggingFaceCausalLM, TurboMindModel
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
 from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
@@ -72,6 +73,10 @@ def get_config_from_arg(args) -> Config:
     if args.config:
         config = Config.fromfile(args.config, format_python_code=False)
         config = try_fill_in_custom_cfgs(config)
+        # set infer accelerator if needed
+        if args.accelerator in ['vllm', 'lmdeploy']:
+            config['models'] = change_accelerator(config['models'],
+                                                  args.accelerator)
         return config
     # parse dataset args
     if not args.datasets and not args.custom_dataset_path:
@@ -137,6 +142,9 @@ def get_config_from_arg(args) -> Config:
                 pad_token_id=args.pad_token_id,
                 run_cfg=dict(num_gpus=args.num_gpus))
             models.append(model)
+        # set infer accelerator if needed
+        if args.accelerator in ['vllm', 'lmdeploy']:
+            models = change_accelerator(models, args.accelerator)
     # parse summarizer args
     summarizer_arg = args.summarizer if args.summarizer is not None \
         else 'example'
@@ -164,6 +172,93 @@ def get_config_from_arg(args) -> Config:
                          format_python_code=False)


+def change_accelerator(models, accelerator):
+    models = models.copy()
+    model_accels = []
+    for model in models:
+        get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
+        # change HuggingFace model to VLLM or TurboMindModel
+        if model['type'] is HuggingFaceCausalLM:
+            gen_args = dict()
+            if model.get('generation_kwargs') is not None:
+                generation_kwargs = model['generation_kwargs'].copy()
+                gen_args['temperature'] = 0.001 if generation_kwargs.get(
+                    'temperature'
+                ) is None else generation_kwargs['temperature']
+                gen_args['top_k'] = 1 if generation_kwargs.get(
+                    'top_k') is None else generation_kwargs['top_k']
+                gen_args['top_p'] = 0.9 if generation_kwargs.get(
+                    'top_p') is None else generation_kwargs['top_p']
+                gen_args['stop_token_ids'] = None if generation_kwargs.get(
+                    'eos_token_id'
+                ) is None else generation_kwargs['eos_token_id']
+                generation_kwargs[
+                    'stop_token_ids'] = None if generation_kwargs.get(
+                        'eos_token_id'
+                    ) is None else generation_kwargs['eos_token_id']
+                generation_kwargs.pop('eos_token_id')
+            else:
+                # if generation_kwargs is not provided, set default values
+                generation_kwargs = dict()
+                gen_args['temperature'] = 0.0
+                gen_args['top_k'] = 1
+                gen_args['top_p'] = 0.9
+                gen_args['stop_token_ids'] = None
+
+            if accelerator == 'lmdeploy':
+                get_logger().info(
+                    f'Transforming {model["abbr"]} to {accelerator}')
+                model = dict(
+                    type=  # noqa E251
+                    f'{TurboMindModel.__module__}.{TurboMindModel.__name__}',
+                    abbr=model['abbr'].replace('hf', 'lmdeploy')
+                    if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
+                    path=model['path'],
+                    engine_config=dict(session_len=model['max_seq_len'],
+                                       max_batch_size=model['batch_size'],
+                                       tp=model['run_cfg']['num_gpus']),
+                    gen_config=dict(top_k=gen_args['top_k'],
+                                    temperature=gen_args['temperature'],
+                                    top_p=gen_args['top_p'],
+                                    max_new_tokens=model['max_out_len'],
+                                    stop_words=gen_args['stop_token_ids']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    concurrency=model['batch_size'],
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template']:
+                    if model.get(item) is not None:
+                        model.update(item, model[item])
+            elif accelerator == 'vllm':
+                get_logger().info(
+                    f'Transforming {model["abbr"]} to {accelerator}')
+                model = dict(
+                    type=f'{VLLM.__module__}.{VLLM.__name__}',
+                    abbr=model['abbr'].replace('hf', 'vllm')
+                    if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
+                    path=model['path'],
+                    model_kwargs=dict(
+                        tensor_parallel_size=model['run_cfg']['num_gpus']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    generation_kwargs=generation_kwargs,
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template', 'end_str']:
+                    if model.get(item) is not None:
+                        model.update(item, model[item])
+                generation_kwargs.update(
+                    dict(temperature=gen_args['temperature']))
+            else:
+                raise ValueError(f'Unsupported accelerator {accelerator}')
+        model_accels.append(model)
+    return model_accels


 def exec_mm_infer_runner(tasks, args, cfg):
     """execute multimodal infer runner according to args."""
     if args.slurm:
......
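Rough before/after picture of what change_accelerator does to one HuggingFace model entry when --accelerator lmdeploy is passed. This is a sketch outside the diff: the input dict is a made-up minimal config, and the real `type` fields are the HuggingFaceCausalLM / TurboMindModel classes resolved to dotted paths at runtime.

hf_model = dict(
    type='HuggingFaceCausalLM',
    abbr='internlm2-7b-hf',
    path='internlm/internlm2-7b',
    max_seq_len=2048,
    max_out_len=100,
    batch_size=8,
    run_cfg=dict(num_gpus=1),
)

# No generation_kwargs in the source config, so the defaults kick in:
# temperature=0.0, top_k=1, top_p=0.9, stop_token_ids=None.
lmdeploy_model = dict(
    type='TurboMindModel',                 # full dotted module path in the real output
    abbr='internlm2-7b-lmdeploy',          # '-hf' suffix swapped for '-lmdeploy'
    path=hf_model['path'],
    engine_config=dict(session_len=2048, max_batch_size=8, tp=1),
    gen_config=dict(top_k=1, temperature=0.0, top_p=0.9,
                    max_new_tokens=100, stop_words=None),
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    concurrency=8,
    run_cfg=dict(num_gpus=1),
)
print(lmdeploy_model['abbr'])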