"...composable_kernel.git" did not exist on "4cf69087c4dbc260ef9016d1d80ce855c018404c"
Unverified commit 3a68083e, authored by Fengzhe Zhou and committed by GitHub

[Sync] update configs (#734)

parent ad96f215
@@ -19,6 +19,13 @@ from opencompass.utils.prompt import get_prompt_hash
 METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
 METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
+
+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
 class DefaultSummarizer:
     """Default summarizer in OpenCompass.
@@ -49,7 +56,13 @@ class DefaultSummarizer:
         self.model_cfgs = self.cfg['models']
         self.dataset_cfgs = self.cfg['datasets']
         self.work_dir = self.cfg['work_dir']
-        self.model_abbrs = [model_abbr_from_cfg(model) for model in self.model_cfgs]
+        model_abbrs = []
+        for model in self.model_cfgs:
+            model_abbr = model_abbr_from_cfg_used_in_summarizer(model)
+            if model_abbr in model_abbrs:
+                continue
+            model_abbrs.append(model_abbr)
+        self.model_abbrs = model_abbrs
 
     def _pick_up_results(self):
         """The function reads the numerical results of evaluations from the
@@ -71,9 +84,9 @@ class DefaultSummarizer:
         dataset_metrics : Dict[str, List[str]] = {}
 
         for model in self.model_cfgs:
-            model_abbr = model_abbr_from_cfg(model)
-            parsed_results[model_abbr] = {}
-            raw_results[model_abbr] = {}
+            model_abbr = model_abbr_from_cfg_used_in_summarizer(model)
+            parsed_results.setdefault(model_abbr, {})
+            raw_results.setdefault(model_abbr, {})
             for dataset in self.dataset_cfgs:
                 dataset_abbr = dataset_abbr_from_cfg(dataset)
                 filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results'))
@@ -165,23 +178,23 @@ class DefaultSummarizer:
                 if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']):
                     group_metrics = [default_metric]
                     for dataset_abbr, metric in sg['subsets']:
-                        scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
+                        scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
                         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                 else:
                     group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
                     if need_smart_metric and len(group_metrics) > 1:
                         for metric in group_metrics:
                             for dataset_abbr in sg['subsets']:
-                                scores.setdefault(metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
+                                scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
                                 eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
                     else:
                         group_metrics = [default_metric]
                         for dataset_abbr in sg['subsets']:
                             metric = dataset_metrics[dataset_abbr][0]
-                            scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
+                            scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
                             eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
 
-                result = parsed_results[model_abbr].get(sg['name'], {})
+                result = {}
                 for metric in scores:
                     if default_metric == 'standard_deviation':
                         avg = sum(scores[metric].values()) / len(scores[metric])
@@ -190,7 +203,11 @@ class DefaultSummarizer:
                     else:
                         if sg.get('weights', []):
                             # check sg['weights'][k] != 0 in case of scores[metric][k] is NaN
-                            numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0)
+                            try:
+                                numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0)
+                            except KeyError:
+                                tmp_scores = {metric: {k.split('@')[0]: v for k, v in scores[metric].items()} for metric in scores}
+                                numerator = sum(tmp_scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0)
                             denominator = sum(sg['weights'].values())
                         else:
                             numerator = sum(scores[metric].values())
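
Note: since subset scores are now keyed as `dataset_abbr + '@' + metric`, a `weights` mapping that still uses bare dataset abbreviations would raise `KeyError`; the fallback added above strips the suffix before applying the weights. A standalone sketch of that fallback with hypothetical scores and weights:

```python
scores = {'accuracy': {'gsm8k@accuracy': 62.0, 'math@accuracy': 18.0}}
weights = {'gsm8k': 2, 'math': 1}  # keyed by bare dataset abbr, as in a summary group

metric = 'accuracy'
try:
    numerator = sum(scores[metric][k] * weights[k] for k in weights if weights[k] != 0)
except KeyError:
    # drop the '@metric' suffix so the keys line up with the weights again
    tmp_scores = {m: {k.split('@')[0]: v for k, v in scores[m].items()} for m in scores}
    numerator = sum(tmp_scores[metric][k] * weights[k] for k in weights if weights[k] != 0)
denominator = sum(weights.values())
print(numerator / denominator)  # (62.0 * 2 + 18.0 * 1) / 3 ≈ 47.33
```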
@@ -200,9 +217,9 @@ class DefaultSummarizer:
                 eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'
 
                 # add to global results
-                raw_results[model_abbr][sg['name']] = scores
-                parsed_results[model_abbr][sg['name']] = result
-                dataset_metrics[sg['name']] = group_metrics
+                raw_results[model_abbr].setdefault(sg['name'], {}).update(scores)
+                parsed_results[model_abbr].setdefault(sg['name'], {}).update(result)
+                dataset_metrics.setdefault(sg['name'], []).extend(group_metrics)
                 dataset_eval_mode[sg['name']] = eval_mode
 
         return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
......
@@ -198,7 +198,8 @@ class OpenICLEvalTask(BaseTask):
                         'incorrect_bpb'] = self.calculate_bpb(pred_dicts)
                 else:
                     result['incorrect_bpb'] = result['correct_bpb'] = -1
-            except Exception:
+            except Exception as e:
+                self.logger.warning(f'Skip dumping details due to: {e}.')
                 result['incorrect_bpb'] = result['correct_bpb'] = -1
         else:
             result.pop('details', None)
@@ -288,13 +289,19 @@ class OpenICLEvalTask(BaseTask):
                 result['predictions'] = str(predictions[i])
                 result['references'] = str(references[i])
                 result['correct'] = str(predictions[i]) == str(references[i])
-            else:
+            elif details is not None:
                 results['type'] = 'GEN'
                 result['prompt'] = origin_prediction['origin_prompt']
                 result['origin_prediction'] = pred_dicts[i]['prediction']
                 result['predictions'] = details[i]['pred']
                 result['references'] = details[i]['answer']
                 result['correct'] = details[i]['correct']
+            else:
+                results['type'] = 'GEN'
+                result['prompt'] = origin_prediction['origin_prompt']
+                result['origin_prediction'] = pred_dicts[i]['prediction']
+                result['predictions'] = str(predictions[i])
+                result['references'] = str(references[i])
             results[str(i)] = result
         return results
......
@@ -19,5 +19,6 @@ def build_model_from_cfg(model_cfg: ConfigDict):
     model_cfg.pop('max_out_len', None)
     model_cfg.pop('batch_size', None)
     model_cfg.pop('abbr', None)
+    model_cfg.pop('summarizer_abbr', None)
     model_cfg.pop('pred_postprocessor', None)
     return MODELS.build(model_cfg)
@@ -4,6 +4,7 @@ from typing import List, Union
 import tabulate
 from mmengine.config import Config
 
+from opencompass.datasets.custom import make_custom_dataset_config
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
 from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
@@ -56,18 +57,37 @@ def get_config_from_arg(args) -> Config:
     3. Huggingface parameter groups and args.datasets
     """
     if args.config:
-        return Config.fromfile(args.config, format_python_code=False)
-    if args.datasets is None:
-        raise ValueError('You must specify "--datasets" if you do not specify '
-                         'a config file path.')
+        config = Config.fromfile(args.config, format_python_code=False)
+        for i, dataset in enumerate(config['datasets']):
+            if 'type' not in dataset:
+                config['datasets'][i] = make_custom_dataset_config(dataset)
+        return config
+
+    # parse dataset args
+    if not args.datasets and not args.custom_dataset_path:
+        raise ValueError('You must specify "--datasets" or '
+                         '"--custom-dataset-path" if you do not specify a '
+                         'config file path.')
     datasets = []
-    datasets_dir = os.path.join(args.config_dir, 'datasets')
-    for dataset in match_cfg_file(datasets_dir, args.datasets):
-        get_logger().info(f'Loading {dataset[0]}: {dataset[1]}')
-        cfg = Config.fromfile(dataset[1])
-        for k in cfg.keys():
-            if k.endswith('_datasets'):
-                datasets += cfg[k]
+    if args.datasets:
+        datasets_dir = os.path.join(args.config_dir, 'datasets')
+        for dataset in match_cfg_file(datasets_dir, args.datasets):
+            get_logger().info(f'Loading {dataset[0]}: {dataset[1]}')
+            cfg = Config.fromfile(dataset[1])
+            for k in cfg.keys():
+                if k.endswith('_datasets'):
+                    datasets += cfg[k]
+    else:
+        dataset = {'path': args.custom_dataset_path}
+        if args.custom_dataset_infer_method is not None:
+            dataset['infer_method'] = args.custom_dataset_infer_method
+        if args.custom_dataset_data_type is not None:
+            dataset['data_type'] = args.custom_dataset_data_type
+        if args.custom_dataset_meta_path is not None:
+            dataset['meta_path'] = args.custom_dataset_meta_path
+        dataset = make_custom_dataset_config(dataset)
+        datasets.append(dataset)
+
+    # parse model args
     if not args.models and not args.hf_path:
         raise ValueError('You must specify a config file path, '
                          'or specify --models and --datasets, or '
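
Note: when run without `--datasets`, the new branch above collects the custom-dataset flags into a plain dict and expands it with `make_custom_dataset_config`. Illustration only; the argument values are hypothetical:

```python
from types import SimpleNamespace

# fake parsed args, mirroring the new custom-dataset CLI flags
args = SimpleNamespace(
    custom_dataset_path='data/my_dataset.jsonl',
    custom_dataset_infer_method='gen',
    custom_dataset_data_type='qa',
    custom_dataset_meta_path=None,
)

dataset = {'path': args.custom_dataset_path}
if args.custom_dataset_infer_method is not None:
    dataset['infer_method'] = args.custom_dataset_infer_method
if args.custom_dataset_data_type is not None:
    dataset['data_type'] = args.custom_dataset_data_type
if args.custom_dataset_meta_path is not None:
    dataset['meta_path'] = args.custom_dataset_meta_path

print(dataset)  # {'path': 'data/my_dataset.jsonl', 'infer_method': 'gen', 'data_type': 'qa'}
# the real code then calls make_custom_dataset_config(dataset) and appends the result to datasets
```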
@@ -98,7 +118,7 @@ def get_config_from_arg(args) -> Config:
                      pad_token_id=args.pad_token_id,
                      run_cfg=dict(num_gpus=args.num_gpus))
         models.append(model)
-
+    # parse summarizer args
     summarizer = args.summarizer if args.summarizer is not None else 'example'
     summarizers_dir = os.path.join(args.config_dir, 'summarizers')
     s = match_cfg_file(summarizers_dir, [summarizer])[0]
......
@@ -138,6 +138,9 @@ def parse_args():
     # set hf args
     hf_parser = parser.add_argument_group('hf_args')
     parse_hf_args(hf_parser)
+    # set custom dataset args
+    custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
+    parse_custom_dataset_args(custom_dataset_parser)
     args = parser.parse_args()
     if args.slurm:
         assert args.partition is not None, (
@@ -199,6 +202,18 @@ def parse_hf_args(hf_parser):
     hf_parser.add_argument('--pad-token-id', type=int)
 
 
+def parse_custom_dataset_args(custom_dataset_parser):
+    """These args are all for the quick construction of custom datasets."""
+    custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
+    custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
+    custom_dataset_parser.add_argument('--custom-dataset-data-type',
+                                       type=str,
+                                       choices=['mcq', 'qa'])
+    custom_dataset_parser.add_argument('--custom-dataset-infer-method',
+                                       type=str,
+                                       choices=['gen', 'ppl'])
+
+
 def main():
     args = parse_args()
     if args.dry_run:
......
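
Note: a quick standalone check that the new argument group parses as defined above; the sample argv values are hypothetical:

```python
import argparse

def parse_custom_dataset_args(custom_dataset_parser):
    """These args are all for the quick construction of custom datasets."""
    custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
    custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
    custom_dataset_parser.add_argument('--custom-dataset-data-type',
                                       type=str,
                                       choices=['mcq', 'qa'])
    custom_dataset_parser.add_argument('--custom-dataset-infer-method',
                                       type=str,
                                       choices=['gen', 'ppl'])

parser = argparse.ArgumentParser()
parse_custom_dataset_args(parser.add_argument_group('custom_dataset_args'))
args = parser.parse_args([
    '--custom-dataset-path', 'data/my_mcq_dataset.jsonl',
    '--custom-dataset-data-type', 'mcq',
    '--custom-dataset-infer-method', 'ppl',
])
print(args.custom_dataset_path, args.custom_dataset_data_type, args.custom_dataset_infer_method)
# data/my_mcq_dataset.jsonl mcq ppl
```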