Unverified commit 8c85edd1, authored by Fengzhe Zhou and committed by GitHub

[Sync] deprecate old mbpps (#1064)

parent c1724013
@@ -4,7 +4,7 @@
## pass@1
If you only need to generate a single response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), then follow the general [quick start tutorial](../get_started/quick_start.md).
If you only need to generate a single response to evaluate pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), then follow the general [quick start tutorial](../get_started/quick_start.md).
For multi-language code evaluation, refer to the [multi-language code evaluation tutorial](./code_eval_service.md).
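As a point of reference for the pass@1 setup above, the following is a minimal config sketch (not part of this commit): it assumes a file placed under `configs/` in OpenCompass, reuses the import paths referenced above (post-rename for MBPP), and omits model settings.

```python
from mmengine.config import read_base

with read_base():
    # dataset configs referenced in the paragraph above
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

datasets = [*humaneval_datasets, *mbpp_datasets]
# models = [...] would be added as described in the quick start tutorial.
```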
@@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
with read_base():
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
@@ -64,7 +64,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
with read_base():
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
humaneval_datasets[0]['num_repeats'] = 10
......
@@ -56,6 +56,12 @@ def parse_args():
'to run',
action='store_true',
default=False)
parser.add_argument(
'--accelerator',
help='Infer accelerator, support vllm and lmdeploy now.',
choices=['vllm', 'lmdeploy', 'hg'],
default='hg',
type=str)
parser.add_argument('-m',
'--mode',
help='Running mode. You can choose "infer" if you '
......
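To show what the new `--accelerator` option does at the argparse level, here is a self-contained sketch; the parser below is a stand-in for illustration, not the actual `run.py` wiring, though the argument definition is copied from the hunk above.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--accelerator',
                    help='Infer accelerator, support vllm and lmdeploy now.',
                    choices=['vllm', 'lmdeploy', 'hg'],
                    default='hg',
                    type=str)

print(parser.parse_args([]).accelerator)                         # 'hg'  (plain HuggingFace backend)
print(parser.parse_args(['--accelerator', 'vllm']).accelerator)  # 'vllm'
```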
@@ -27,11 +27,9 @@ except ImportError:
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger
from .base import BaseDataset
logger = get_logger()
TIMEOUT = 10
@@ -321,7 +319,7 @@ def timeout_handler(signum, frame):
try:
signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError:
logger.warning('signal.SIGALRM is not available on this platform')
print('signal.SIGALRM is not available on this platform')
timeout = 4 # seconds
......
@@ -134,11 +134,20 @@ class MBPPPlusDataset(BaseDataset):
multiple responses in special cases.
"""
def processing_test(example):
example['test_case'] = example['test_list']
example['test_list'] = '\n'.join(example['test_list'])
example['test_list_2'] = example['test_list']
example['test_column'] = dict(test_list_2=example['test_list'],
task_id=example['task_id'])
return example
dataset = []
with open(path, 'r', encoding='utf-8') as f:
for line in f:
dataset.extend(
[json.loads(line.strip()) for _ in range(num_repeats)])
example = json.loads(line.strip())
example = processing_test(example)
dataset.extend([example for _ in range(num_repeats)])
return Dataset.from_list(dataset)
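To make the new `processing_test` hook concrete, here is what its assignments do to a toy record (the field values are invented; only `test_list` and `task_id` matter):

```python
example = {'task_id': 2,
           'test_list': ['assert add(1, 2) == 3', 'assert add(0, 0) == 0']}

example['test_case'] = example['test_list']             # keep the original list under a separate key
example['test_list'] = '\n'.join(example['test_list'])  # newline-joined string for prompting
example['test_list_2'] = example['test_list']
example['test_column'] = dict(test_list_2=example['test_list'],
                              task_id=example['task_id'])

# example['test_column'] ->
# {'test_list_2': 'assert add(1, 2) == 3\nassert add(0, 0) == 0', 'task_id': 2}
print(example['test_column'])
```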
@@ -211,7 +220,7 @@ class MBPPEvaluator(BaseEvaluator):
predictions)):
pred = self._process_answer(pred)
programs = self._process_test(refer, pred)
future = executor.submit(execution, programs, i, 3)
future = executor.submit(execution, programs, i, 10)
futures.append(future)
details[str(i)] = {}
details[str(i)]['origin'] = predictions[i]
@@ -262,39 +271,34 @@ class MBPPEvaluator(BaseEvaluator):
return {f'mbpp_plus_{k}': score[k] * 100 for k in score}
def _process_answer(self, text):
try:
# for chatGLM related text
eval_text = eval(text)
except Exception:
pass
else:
if isinstance(eval_text, str):
text = eval_text
# deal with code block
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```xxx
text = text[max(text.find('\n') + 1, 0):]
text = text.strip()
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
if match:
text = text[:match.start()]
match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
if match:
text = text[match.end():]
patterns = [
r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",
r"BEGIN\s*'(.*)'\s*\[DONE\]",
r"\[BEGIN\]\s*'(.*)'\s*DONE",
r"BEGIN\s*'(.*)'\s*DONE",
r"\[BEGIN\]\s*'(.*)\s*\[DONE\]",
r"BEGIN\s*'(.*)\s*\[DONE\]",
r"\[BEGIN\]\s*'(.*)\s*DONE",
r"BEGIN\s*'(.*)\s*DONE",
r'\[BEGIN\]\s*(.*)\s*\[DONE\]',
r'BEGIN\s*(.*)\s*\[DONE\]',
r'\[BEGIN\]\s*(.*)\s*DONE',
r'BEGIN\s*(.*)\s*DONE',
r'```python\s*(.*)\s*```',
r'```\s*(.*)\s*```',
r'(.*)\s*```.*',
r"\[BEGIN\]\s*'(.*)",
r'\[BEGIN\](.*)',
]
for p in patterns:
match = re.search(p, text, re.DOTALL)
if match:
text = match.group(1)
break
text = text.split('```')[0]
text = re.split(r"'?\s*\[?DONE\]?", text)[0]
text = text.replace('\\_', '_')
text = text.strip()
if text.startswith("'"):
text = text[1:]
if text.endswith("'"):
text = text[:-1]
text = text.replace('\\', '')
match = re.search(r'```python(.*)```', text, re.DOTALL)
if match:
text = match.group(1).strip().split('```')[0].strip()
return text
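A quick sanity check of the new pattern-driven extraction (the pattern list is abbreviated from the hunk above; the sample completion is made up):

```python
import re

patterns = [
    r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",   # first pattern from the list above
    r'```python\s*(.*)\s*```',
]

text = "[BEGIN] 'def add(a, b):\n    return a + b' [DONE]"
for p in patterns:
    match = re.search(p, text, re.DOTALL)
    if match:
        text = match.group(1)
        break

text = text.split('```')[0]
text = re.split(r"'?\s*\[?DONE\]?", text)[0]
text = text.replace('\\_', '_')
print(text.strip())   # prints only the extracted code, with [BEGIN]/[DONE] wrappers removed
```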
def _process_test(self, test_case, pred):
@@ -451,7 +455,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
for pred in preds:
pred = self._process_answer(pred)
programs = self._process_test(test_case, pred)
future = executor.submit(execution, programs, task_id, 3)
future = executor.submit(execution, programs, task_id, 10)
futures.append(future)
from tqdm import tqdm
......
@@ -27,11 +27,9 @@ except ImportError:
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger
from .base import BaseDataset
logger = get_logger()
TIMEOUT = 10
@@ -267,7 +265,7 @@ def timeout_handler(signum, frame):
try:
signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError:
logger.warning('signal.SIGALRM is not available on this platform')
print('signal.SIGALRM is not available on this platform')
timeout = 4 # seconds
......
@@ -84,7 +84,12 @@ class OpenAI(BaseAPIModel):
self.top_logprobs = top_logprobs
if isinstance(key, str):
self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key]
if key == 'ENV':
if 'OPENAI_API_KEY' not in os.environ:
raise ValueError('OpenAI API key is not set.')
self.keys = os.getenv('OPENAI_API_KEY').split(',')
else:
self.keys = [key]
else:
self.keys = key
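The effect of the new `ENV` handling above is that `OPENAI_API_KEY` may now hold several comma-separated keys; a self-contained sketch (all key values below are fake placeholders), with the keys presumably picked among during requests:

```python
import os

os.environ['OPENAI_API_KEY'] = 'sk-fake-one,sk-fake-two'   # placeholder values
key = 'ENV'

if key == 'ENV':
    if 'OPENAI_API_KEY' not in os.environ:
        raise ValueError('OpenAI API key is not set.')
    keys = os.getenv('OPENAI_API_KEY').split(',')
else:
    keys = [key]

print(keys)   # ['sk-fake-one', 'sk-fake-two']
```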
@@ -101,12 +106,11 @@ class OpenAI(BaseAPIModel):
self.url = openai_api_base
self.path = path
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
temperature: float = 0.7,
) -> List[str]:
def generate(self,
inputs: List[PromptType],
max_out_len: int = 512,
temperature: float = 0.7,
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
Args:
@@ -412,9 +416,15 @@ class OpenAIAllesAPIN(OpenAI):
}
for _ in range(self.retry):
self.wait()
raw_response = requests.post(self.url,
headers=self.headers,
data=json.dumps(data))
try:
raw_response = requests.post(self.url,
headers=self.headers,
data=json.dumps(data))
except requests.ConnectionError:
self.logger.error('Request error, got',
str(raw_response.content))
time.sleep(1)
continue
try:
response = raw_response.json()
except requests.JSONDecodeError:
......
@@ -161,7 +161,7 @@ class Qwen(BaseAPIModel):
time.sleep(1)
continue
if response.status_code == 429:
print('Rate limited')
print(response)
time.sleep(2)
continue
if response.status_code == 400:
......
@@ -214,6 +214,16 @@ class DLCRunner(BaseRunner):
pod_create_time = None
pri_time = None
initial_time = datetime.datetime.now()
url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com' # noqa: E501
logger = get_logger()
logger.debug('')
logger.debug('*' * 168)
logger.debug(
f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail' # noqa: E501
)
logger.debug('*' * 168)
while True:
# 1. Avoid to request dlc too frequently.
# 2. DLC job may not be ready immediately after creation.
......
@@ -188,6 +188,7 @@ class SlurmSequentialRunner(BaseRunner):
tmpl += f' --gres=gpu:{num_gpus}'
for extra_cmd in self.extra_command:
tmpl += f' {extra_cmd}'
tmpl += ' -x HOST-10-140-60-7'
tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'
get_cmd = partial(task.get_command,
cfg_path=param_file,
......
@@ -72,7 +72,7 @@ dataset_mapping_dict = {}
needle_counts = ['2', '3', '4', '5']
languages = ['en', 'zh']
sizes = ['4k', '8k', '32k', '200k', '1000k']
sizes = ['4k', '8k', '32k', '200k', '256k', '1000k']
types = ['origin', 'parallel']
for needle_count in needle_counts:
@@ -190,7 +190,7 @@ def save_results_to_plots(txt_results_save_path):
numbers = [2, 3, 4, 5]
languages = ['en', 'zh']
size_exists = []
sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']
sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_256k', '_1000k']
for size in sizes_origin:
if size in content:
@@ -301,6 +301,9 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
markersize=8,
label='Average Depth Score'
)
for x_value, y_value in zip(x_data, y_data):
ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top')
ax2.set_ylim(0, 100)
ax2.set_yticklabels([])
@@ -353,7 +356,7 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
new_save_path = os.path.join(directory_path, new_filename)
plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0)
print(f'Saved :{new_save_path}')
print(f'Saved: {new_save_path}')
plt.close()
......
# flake8: noqa: E501
import ast
import csv
# flake8: noqa
# yapf: disable
import os
import os.path as osp
import re
@@ -10,7 +9,7 @@ from itertools import product
import mmengine
from mmengine import ConfigDict
from prettytable import from_csv
from tabulate import tabulate
from opencompass.partitioners.sub_naive import remove_duplicate_pairs
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
@@ -18,6 +17,12 @@ from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
from .utils import get_judgeanswer_and_reference, get_outdir
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
return model['summarizer_abbr']
else:
return model_abbr_from_cfg(model)
def post_process_compass_arena(s):
if result := re.findall('(?:选择:|Choice: )([ABC])', s):
return result[0]
@@ -68,17 +73,90 @@ class CompassArenaSummarizer:
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.judge_models = self.cfg.get('judge_models', None)
self.meta_judge_model = self.cfg.eval.partitioner.get(
'meta_judge_model', None)
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
self.judge_type = judge_type
assert self.judge_type in ['general']
self.judge_map = {
'general': post_process_compass_arena,
}
self.judge_map = {'general': post_process_compass_arena}
self.judge_function = self.judge_map[self.judge_type]
self.check_pos_bias = check_pos_bias
self.summary_type = summary_type
def get_score(self, time_str):
output_dir, results_folder = get_outdir(self.cfg, time_str)
model_combinations = list(product(self.base_models, self.compare_models))
unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
if self.meta_judge_model is not None:
self.judge_models.append(self.meta_judge_model)
scores = {}
for idx, judge_model_cfg in enumerate(self.judge_models):
judge_model = model_abbr_from_cfg(judge_model_cfg)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
for model_pair in unique_combinations:
model1 = model_pair[0]['abbr']
model2 = model_pair[1]['abbr']
if idx == len(self.judge_models):
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
else:
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
subdir_path = os.path.join(results_folder, subdir)
if not os.path.isdir(subdir_path):
print(subdir_path + ' is not exist! please check!')
continue
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
if self.check_pos_bias:
bias_num = check_position_bias(judged_answers, references)
else:
bias_num = 0
win_model1 = defaultdict(float)
win_model2 = defaultdict(float)
categories = defaultdict(float)
model1 = references[0]['answer1']
model2 = references[0]['answer2']
for prediction, reference in zip(judged_answers, references):
categories[dataset_abbr] += 1
categories[reference['capability']] += 1
if prediction == 'A':
if reference['answer1'] == model1:
score_1, score_2 = 1, 0
else:
score_1, score_2 = 0, 1
elif prediction == 'B':
if reference['answer1'] == model1:
score_1, score_2 = 0, 1
else:
score_1, score_2 = 1, 0
elif prediction == 'C':
if self.summary_type == 'half_add':
score_1, score_2 = 0.5, 0.5
else:
score_1, score_2 = 0, 0
win_model1[reference['capability']] += score_1
win_model1[dataset_abbr] += score_1
win_model2[reference['capability']] += score_2
win_model2[dataset_abbr] += score_2
for capability in categories:
win_model1[capability] = win_model1[capability] / categories[capability] * 100
win_model1[capability] = round(win_model1[capability], 2)
win_model2[capability] = win_model2[capability] / categories[capability] * 100
win_model2[capability] = round(win_model2[capability], 2)
win_model1['position_bias'] = bias_num
win_model2['position_bias'] = bias_num
if judge_model not in scores:
scores[judge_model] = {}
if dataset_abbr not in scores[judge_model]:
scores[judge_model][dataset_abbr] = {}
scores[judge_model][dataset_abbr][model2] = win_model2
return scores
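For orientation, the nested `scores` dict returned by `get_score` above has roughly the shape sketched below; all judge, dataset, and model names and all numbers are invented for illustration.

```python
scores = {
    'judge-model': {                      # one entry per judge (plus the meta judge, if configured)
        'some_dataset': {                 # dataset abbreviation
            'model-b': {                  # the compare model of each (base, compare) pair
                'some_dataset': 55.0,     # overall win rate of model-b, in percent
                'reasoning': 60.0,        # per-capability win rates, in percent
                'language': 50.0,
                'position_bias': 3,       # count reported by check_position_bias()
            },
        },
    },
}
```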
def summarize(
self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
@@ -91,143 +169,72 @@ class CompassArenaSummarizer:
Returns:
pd.DataFrame: The summary results.
"""
dataset_cfgs = self.cfg['datasets']
scores = self.get_score(time_str)
# scores['win_' + model1] = win_model1
output_dir, results_folder = get_outdir(self.cfg, time_str)
model_combinations = list(
product(self.base_models, self.compare_models))
unique_combinations = remove_duplicate_pairs(
[combo for combo in model_combinations if combo[0] != combo[1]])
fout_list = []
pre_len = len(self.judge_models)
if self.meta_judge_model is not None:
self.judge_models.append(self.meta_judge_model)
meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model)
else:
meta_judge_model_abbr = None
for idx, judge_model in enumerate(self.judge_models):
judge_model = model_abbr_from_cfg(judge_model)
for dataset in dataset_cfgs:
judge_abbr = model_abbr_from_cfg(judge_model)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
if idx == pre_len:
fout = osp.join(
output_dir, 'summarized-by--' + judge_model + '-' +
dataset_abbr + '-report.csv')
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']]
row_headers = [dataset_abbr, 'position_bias'] + row_headers
headers = [''] + summarizer_model_abbrs
table = []
for row_header in row_headers:
row = [row_header]
for model_cfg in self.compare_models:
model_abbr = model_abbr_from_cfg(model_cfg)
s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
if isinstance(s, float):
s = f'{s:.2f}'
if isinstance(s, int):
s = str(s)
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
print(txt)
if idx == len(self.judge_models):
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
else:
fout = osp.join(
output_dir, 'judged-by--' + judge_model + '-' +
dataset_abbr + '-report.csv')
fout_list.append(fout)
for model_pair in unique_combinations:
model1, model2, = model_pair[0]['abbr'], model_pair[1][
'abbr'],
if idx == pre_len:
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
else:
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
judged_answers, references = get_judgeanswer_and_reference(
dataset,
subdir_path,
self.judge_function,
)
if self.check_pos_bias:
bias_num = check_position_bias(
judged_answers, references)
else:
bias_num = 0
win_model1, win_model2, categories = defaultdict(
float), defaultdict(float), defaultdict(float)
model1, model2 = references[0]['answer1'], references[
0]['answer2']
for prediction, reference in zip(
judged_answers, references):
if self.summary_type == 'single':
if prediction == 'A':
categories['total'] += 1
categories[reference['capability']] += 1
if reference['answer1'] == model1:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
else:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
elif prediction == 'B':
categories['total'] += 1
categories[reference['capability']] += 1
if reference['answer1'] == model1:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
else:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
elif self.summary_type == 'half_add':
categories['total'] += 1
categories[reference['capability']] += 1
if prediction == 'A':
if reference['answer1'] == model1:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
else:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
elif prediction == 'B':
if reference['answer1'] == model1:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
else:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
elif prediction == 'C':
win_model1[reference['capability']] += 0.5
win_model1['total'] += 0.5
win_model2[reference['capability']] += 0.5
win_model2['total'] += 0.5
for capability in categories:
if capability not in win_model1:
win_model1[capability] = 0.0
else:
win_model1[capability] = round(
(win_model1[capability] /
categories[capability]) * 100, 2)
if capability not in win_model2:
win_model2[capability] = 0.0
else:
win_model2[capability] = round(
(win_model2[capability] /
categories[capability]) * 100, 2)
win_model1['position_bias'] = bias_num
win_model2['position_bias'] = bias_num
scores = {
'win_' + model1: win_model1,
'win_' + model2: win_model2
}
rows = list(scores.keys())
columns = list(scores[rows[0]].keys())
columns.insert(0, columns.pop(columns.index('total')))
columns.insert(
1, columns.pop(columns.index('position_bias')))
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow([model1 + '_vs_' + model2] +
columns)
for row in rows:
writer.writerow([row] + [
scores[row][column] for column in columns
])
else:
print(subdir_path + ' is not exist! please check!')
for fout in fout_list:
with open(fout, 'r') as f:
x = from_csv(f)
print(fout)
print(x)
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
with open(output_filename, 'w') as f:
f.write(','.join(headers) + '\n')
for line in table:
f.write(','.join(line) + '\n')
print(output_filename)
table = []
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
headers = [''] + summarizer_model_abbrs
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
row = [dataset_abbr]
for model_cfg in self.compare_models:
model_abbr = model_abbr_from_cfg(model_cfg)
s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
if isinstance(s, float):
s = f'{s:.2f}'
if isinstance(s, int):
s = str(s)
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
print(txt)
if idx == len(self.judge_models):
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
else:
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
with open(output_filename, 'w') as f:
f.write(','.join(headers) + '\n')
for line in table:
f.write(','.join(line) + '\n')
print(output_filename)
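The reporting path above now prints with `tabulate` and writes plain CSV instead of going through `prettytable`; below is a minimal standalone sketch of that output step (header names, file name, and values are invented):

```python
from tabulate import tabulate

headers = ['', 'model-b', 'model-c']
table = [['some_dataset', '55.00', '48.50'],
         ['position_bias', '3', '1'],
         ['reasoning', '60.00', '47.00']]

# console report
print(tabulate(table, headers=headers))

# CSV report, one file per judge model / dataset
with open('judged-by--judge-model-some_dataset-report.csv', 'w') as f:
    f.write(','.join(headers) + '\n')
    for line in table:
        f.write(','.join(line) + '\n')
```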
# flake8: noqa: E501
# flake8: noqa
# yapf: disable
import csv
import os
import os.path as osp
@@ -8,11 +9,7 @@ from datetime import datetime
import numpy as np
from mmengine import ConfigDict
try:
from prettytable import from_csv
except ImportError:
from_csv = None
from tabulate import tabulate
from opencompass.utils import model_abbr_from_cfg
@@ -20,6 +17,12 @@ from .compass_arena import CompassArenaSummarizer
from .utils import get_judgeanswer_and_reference, get_outdir
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
return model['summarizer_abbr']
else:
return model_abbr_from_cfg(model)
def post_process_mtbench_pair(judgement: str):
"""Input a string like below:
@@ -52,7 +55,7 @@ def get_capability_results(
references,
fout,
fout_flag,
model,
model_abbr,
):
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
@@ -70,12 +73,12 @@ def get_capability_results(
capability_avg_ratings[capability] = s
columns = list(capability_avg_ratings.keys())
columns.insert(0, columns.pop(columns.index('total')))
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
if fout_flag == 0:
writer.writerow(['model'] + columns)
writer.writerow([model] +
[capability_avg_ratings[column] for column in columns])
writer.writerow([model_abbr] + [capability_avg_ratings[column] for column in columns])
class MTBenchSummarizer(CompassArenaSummarizer):
@@ -92,13 +95,9 @@ class MTBenchSummarizer(CompassArenaSummarizer):
self.cfg = config
if self.judge_type == 'single':
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.eval_model_abbrs = [
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
]
elif self.judge_type == 'pair':
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner'][
'compare_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
self.judge_map = {
'single': post_process_mtbench_single,
@@ -106,8 +105,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
}
self.judge_function = self.judge_map[self.judge_type]
def summarize(self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
@@ -116,33 +114,40 @@ class MTBenchSummarizer(CompassArenaSummarizer):
Returns:
pd.DataFrame: The summary results.
"""
if self.judge_type == 'single':
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
model, judge_model = eval_model_abbr, self.judge_abbr
fout = osp.join(
output_dir,
'judged-by--' + judge_model + '-capability.csv')
overall_judged_answers, overall_references = [], []
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
overall_judged_answers += judged_answers
overall_references += references
get_capability_results(overall_judged_answers,
overall_references, fout, fout_flag,
model)
fout_flag += 1
else:
print(subdir_path + ' is not exist! please check!')
with open(fout, 'r') as f:
x = from_csv(f)
print(x)
print(fout)
elif self.judge_type == 'pair':
super().summarize()
if self.judge_type == 'pair':
return super().summarize()
# self.judge_type == 'single'
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
for eval_model_cfg in self.eval_model_cfgs:
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
if os.path.isdir(subdir_path):
fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
overall_judged_answers, overall_references = [], []
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
overall_judged_answers += judged_answers
overall_references += references
get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
fout_flag += 1
else:
print(subdir_path + ' is not exist! please check!')
with open(fout, 'r') as f:
csv_reader = csv.reader(f)
header = next(csv_reader)
table = [line for line in csv_reader]
new_header = [''] + [line[0] for line in table]
new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
t = tabulate(new_table, headers=new_header)
with open(fout, 'w') as f:
f.write(','.join(new_header) + '\n')
for line in new_table:
f.write(','.join(map(str, line)) + '\n')
print(t)
print(fout)
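The capability-CSV rewrite at the end of the single-judge branch transposes rows and columns so that capabilities become rows and models become columns; here is a toy run of that list comprehension (model names and ratings invented):

```python
header = ['model', 'total', 'reasoning', 'writing']
table = [['model-a', '7.8', '8.1', '7.5'],
         ['model-b', '7.2', '7.0', '7.4']]

new_header = [''] + [line[0] for line in table]
new_table = [[h] + [line[i] for line in table]
             for i, h in enumerate(header[1:], start=1)]

print(new_header)  # ['', 'model-a', 'model-b']
print(new_table)   # [['total', '7.8', '7.2'], ['reasoning', '8.1', '7.0'], ['writing', '7.5', '7.4']]
```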
@@ -3,6 +3,7 @@ import copy
import fnmatch
import math
import os.path as osp
import re
import statistics
import time
from collections import Counter
@@ -38,12 +39,12 @@ def extract_role_pred(s: str, begin_str: Optional[str],
start = 0
end = len(s)
if begin_str:
if begin_str and re.match(r'\s*', begin_str) is None:
begin_idx = s.find(begin_str)
if begin_idx != -1:
start = begin_idx + len(begin_str)
if end_str:
if end_str and re.match(r'\s*', end_str) is None:
# TODO: Support calling tokenizer for the accurate eos token
# and avoid such hardcode
end_idx = s.find(end_str, start)
......
# flake8: noqa: E501
import copy
import json
import os
import os.path as osp
import mmengine
@@ -123,6 +124,10 @@ class AlpacaEvalTask(BaseTask):
command = ''
if api_key is not None:
command += f'export OPENAI_API_KEY={api_key}; '
else:
api_key = os.environ.get('OPENAI_API_KEY', '').split(',')[0]
if api_key:
command += f'export OPENAI_API_KEY={api_key}; '
command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
return template.format(task_cmd=command)
......
@@ -5,6 +5,7 @@ import tabulate
from mmengine.config import Config
from opencompass.datasets.custom import make_custom_dataset_config
from opencompass.models import VLLM, HuggingFaceCausalLM, TurboMindModel
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
@@ -72,6 +73,10 @@ def get_config_from_arg(args) -> Config:
if args.config:
config = Config.fromfile(args.config, format_python_code=False)
config = try_fill_in_custom_cfgs(config)
# set infer accelerator if needed
if args.accelerator in ['vllm', 'lmdeploy']:
config['models'] = change_accelerator(config['models'],
args.accelerator)
return config
# parse dataset args
if not args.datasets and not args.custom_dataset_path:
@@ -137,6 +142,9 @@ def get_config_from_arg(args) -> Config:
pad_token_id=args.pad_token_id,
run_cfg=dict(num_gpus=args.num_gpus))
models.append(model)
# set infer accelerator if needed
if args.accelerator in ['vllm', 'lmdeploy']:
models = change_accelerator(models, args.accelerator)
# parse summarizer args
summarizer_arg = args.summarizer if args.summarizer is not None \
else 'example'
@@ -164,6 +172,93 @@ def get_config_from_arg(args) -> Config:
format_python_code=False)
def change_accelerator(models, accelerator):
models = models.copy()
model_accels = []
for model in models:
get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
# change HuggingFace model to VLLM or TurboMindModel
if model['type'] is HuggingFaceCausalLM:
gen_args = dict()
if model.get('generation_kwargs') is not None:
generation_kwargs = model['generation_kwargs'].copy()
gen_args['temperature'] = 0.001 if generation_kwargs.get(
'temperature'
) is None else generation_kwargs['temperature']
gen_args['top_k'] = 1 if generation_kwargs.get(
'top_k') is None else generation_kwargs['top_k']
gen_args['top_p'] = 0.9 if generation_kwargs.get(
'top_p') is None else generation_kwargs['top_p']
gen_args['stop_token_ids'] = None if generation_kwargs.get(
'eos_token_id'
) is None else generation_kwargs['eos_token_id']
generation_kwargs[
'stop_token_ids'] = None if generation_kwargs.get(
'eos_token_id'
) is None else generation_kwargs['eos_token_id']
generation_kwargs.pop('eos_token_id')
else:
# if generation_kwargs is not provided, set default values
generation_kwargs = dict()
gen_args['temperature'] = 0.0
gen_args['top_k'] = 1
gen_args['top_p'] = 0.9
gen_args['stop_token_ids'] = None
if accelerator == 'lmdeploy':
get_logger().info(
f'Transforming {model["abbr"]} to {accelerator}')
model = dict(
type= # noqa E251
f'{TurboMindModel.__module__}.{TurboMindModel.__name__}',
abbr=model['abbr'].replace('hf', 'lmdeploy')
if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(session_len=model['max_seq_len'],
max_batch_size=model['batch_size'],
tp=model['run_cfg']['num_gpus']),
gen_config=dict(top_k=gen_args['top_k'],
temperature=gen_args['temperature'],
top_p=gen_args['top_p'],
max_new_tokens=model['max_out_len'],
stop_words=gen_args['stop_token_ids']),
max_out_len=model['max_out_len'],
max_seq_len=model['max_seq_len'],
batch_size=model['batch_size'],
concurrency=model['batch_size'],
run_cfg=model['run_cfg'],
)
for item in ['meta_template']:
if model.get(item) is not None:
model.update(item, model[item])
elif accelerator == 'vllm':
get_logger().info(
f'Transforming {model["abbr"]} to {accelerator}')
model = dict(
type=f'{VLLM.__module__}.{VLLM.__name__}',
abbr=model['abbr'].replace('hf', 'vllm')
if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
path=model['path'],
model_kwargs=dict(
tensor_parallel_size=model['run_cfg']['num_gpus']),
max_out_len=model['max_out_len'],
max_seq_len=model['max_seq_len'],
batch_size=model['batch_size'],
generation_kwargs=generation_kwargs,
run_cfg=model['run_cfg'],
)
for item in ['meta_template', 'end_str']:
if model.get(item) is not None:
model.update(item, model[item])
generation_kwargs.update(
dict(temperature=gen_args['temperature']))
else:
raise ValueError(f'Unsupported accelerator {accelerator}')
model_accels.append(model)
return model_accels
def exec_mm_infer_runner(tasks, args, cfg):
"""execute multimodal infer runner according to args."""
if args.slurm:
......
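Finally, to make the `change_accelerator` transformation above easier to follow, here is a rough before/after sketch of a single model entry under `--accelerator lmdeploy`. All concrete values (model name, path, GPU count) are invented, and the `type` strings are abbreviated; the real code builds them from `TurboMindModel.__module__` and `TurboMindModel.__name__`.

```python
hf_model = dict(
    type='HuggingFaceCausalLM',       # abbreviated; the config normally carries the class itself
    abbr='demo-7b-hf',
    path='org/demo-7b',
    max_out_len=100,
    max_seq_len=2048,
    batch_size=8,
    run_cfg=dict(num_gpus=1),
)

# Roughly what change_accelerator() produces for the entry above
# (no generation_kwargs given, so the default gen_args are used):
lmdeploy_model = dict(
    type='TurboMindModel',                     # abbreviated module path
    abbr='demo-7b-lmdeploy',                   # '-hf' abbrs get their suffix swapped
    path=hf_model['path'],
    engine_config=dict(session_len=hf_model['max_seq_len'],
                       max_batch_size=hf_model['batch_size'],
                       tp=hf_model['run_cfg']['num_gpus']),
    gen_config=dict(top_k=1, temperature=0.0, top_p=0.9,
                    max_new_tokens=hf_model['max_out_len'],
                    stop_words=None),
    max_out_len=hf_model['max_out_len'],
    max_seq_len=hf_model['max_seq_len'],
    batch_size=hf_model['batch_size'],
    concurrency=hf_model['batch_size'],
    run_cfg=hf_model['run_cfg'],
)
```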