Unverified Commit 3a68083e authored by Fengzhe Zhou, committed by GitHub

[Sync] update configs (#734)

parent ad96f215
@@ -19,6 +19,13 @@ from opencompass.utils.prompt import get_prompt_hash
METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth']
METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len']
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
return model['summarizer_abbr']
else:
return model_abbr_from_cfg(model)
class DefaultSummarizer:
"""Default summarizer in OpenCompass.
@@ -49,7 +56,13 @@ class DefaultSummarizer:
self.model_cfgs = self.cfg['models']
self.dataset_cfgs = self.cfg['datasets']
self.work_dir = self.cfg['work_dir']
self.model_abbrs = [model_abbr_from_cfg(model) for model in self.model_cfgs]
model_abbrs = []
for model in self.model_cfgs:
model_abbr = model_abbr_from_cfg_used_in_summarizer(model)
if model_abbr in model_abbrs:
continue
model_abbrs.append(model_abbr)
self.model_abbrs = model_abbrs
def _pick_up_results(self):
"""The function reads the numerical results of evaluations from the
@@ -71,9 +84,9 @@ class DefaultSummarizer:
dataset_metrics : Dict[str, List[str]] = {}
for model in self.model_cfgs:
model_abbr = model_abbr_from_cfg(model)
parsed_results[model_abbr] = {}
raw_results[model_abbr] = {}
model_abbr = model_abbr_from_cfg_used_in_summarizer(model)
parsed_results.setdefault(model_abbr, {})
raw_results.setdefault(model_abbr, {})
for dataset in self.dataset_cfgs:
dataset_abbr = dataset_abbr_from_cfg(dataset)
filepath = get_infer_output_path(model, dataset, osp.join(self.work_dir, 'results'))
@@ -165,23 +178,23 @@ class DefaultSummarizer:
if all(isinstance(dataset_abbr, (list, tuple)) for dataset_abbr in sg['subsets']):
group_metrics = [default_metric]
for dataset_abbr, metric in sg['subsets']:
scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
if need_smart_metric and len(group_metrics) > 1:
for metric in group_metrics:
for dataset_abbr in sg['subsets']:
scores.setdefault(metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
else:
group_metrics = [default_metric]
for dataset_abbr in sg['subsets']:
metric = dataset_metrics[dataset_abbr][0]
scores.setdefault(default_metric, {})[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][metric]
scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
result = parsed_results[model_abbr].get(sg['name'], {})
result = {}
for metric in scores:
if default_metric == 'standard_deviation':
avg = sum(scores[metric].values()) / len(scores[metric])
@@ -190,7 +203,11 @@ class DefaultSummarizer:
else:
if sg.get('weights', []):
# check sg['weights'][k] != 0 in case scores[metric][k] is NaN
numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0)
try:
numerator = sum(scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0)
except KeyError:
tmp_scores = {metric: {k.split('@')[0]: v for k, v in scores[metric].items()} for metric in scores}
numerator = sum(tmp_scores[metric][k] * sg['weights'][k] for k in sg['weights'] if sg['weights'][k] != 0)
denominator = sum(sg['weights'].values())
else:
numerator = sum(scores[metric].values())
@@ -200,9 +217,9 @@ class DefaultSummarizer:
eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'
# add to global results
raw_results[model_abbr][sg['name']] = scores
parsed_results[model_abbr][sg['name']] = result
dataset_metrics[sg['name']] = group_metrics
raw_results[model_abbr].setdefault(sg['name'], {}).update(scores)
parsed_results[model_abbr].setdefault(sg['name'], {}).update(result)
dataset_metrics.setdefault(sg['name'], []).extend(group_metrics)
dataset_eval_mode[sg['name']] = eval_mode
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
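Subset scores inside a group are now keyed as 'dataset@metric', while user-supplied weights in a subset group stay keyed by the bare dataset abbreviation, so the weighted average first tries the new keys and falls back to stripping the '@metric' suffix on a KeyError. A minimal sketch of that fallback with made-up datasets and numbers:

```python
# Illustrative only: 'scores' uses the new 'dataset@metric' keys,
# while 'weights' is still keyed by the bare dataset abbreviation.
scores = {'accuracy': {'gsm8k@accuracy': 70.0, 'math@accuracy': 30.0}}
weights = {'gsm8k': 2, 'math': 1}
metric = 'accuracy'

try:
    numerator = sum(scores[metric][k] * weights[k] for k in weights if weights[k] != 0)
except KeyError:
    # strip the '@metric' suffix so the score keys line up with the weight keys again
    tmp_scores = {m: {k.split('@')[0]: v for k, v in scores[m].items()} for m in scores}
    numerator = sum(tmp_scores[metric][k] * weights[k] for k in weights if weights[k] != 0)

denominator = sum(weights.values())
print(numerator / denominator)  # (70*2 + 30*1) / 3 ≈ 56.67
```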
......
@@ -198,7 +198,8 @@ class OpenICLEvalTask(BaseTask):
'incorrect_bpb'] = self.calculate_bpb(pred_dicts)
else:
result['incorrect_bpb'] = result['correct_bpb'] = -1
except Exception:
except Exception as e:
self.logger.warning(f'Skip dumping details due to: {e}.')
result['incorrect_bpb'] = result['correct_bpb'] = -1
else:
result.pop('details', None)
@@ -288,13 +289,19 @@ class OpenICLEvalTask(BaseTask):
result['predictions'] = str(predictions[i])
result['references'] = str(references[i])
result['correct'] = str(predictions[i]) == str(references[i])
else:
elif details is not None:
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']
result['origin_prediction'] = pred_dicts[i]['prediction']
result['predictions'] = details[i]['pred']
result['references'] = details[i]['answer']
result['correct'] = details[i]['correct']
else:
results['type'] = 'GEN'
result['prompt'] = origin_prediction['origin_prompt']
result['origin_prediction'] = pred_dicts[i]['prediction']
result['predictions'] = str(predictions[i])
result['references'] = str(references[i])
results[str(i)] = result
return results
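With the extra branch above, GEN-style details are copied from the evaluator's per-sample details only when they exist; otherwise the task falls back to the raw post-processed predictions and references and omits the per-sample 'correct' flag. An illustrative (made-up) entry produced by that fallback branch:

```python
# Hypothetical values; the keys match the fallback branch above. No 'correct'
# field is written, because without evaluator details the task cannot tell
# whether the post-processed prediction matched the reference.
results = {
    'type': 'GEN',
    '0': {
        'prompt': 'Question: 1 + 1 = ?\nAnswer:',
        'origin_prediction': ' The answer is 2.',
        'predictions': '2',
        'references': '2',
    },
}
```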
......
@@ -19,5 +19,6 @@ def build_model_from_cfg(model_cfg: ConfigDict):
model_cfg.pop('max_out_len', None)
model_cfg.pop('batch_size', None)
model_cfg.pop('abbr', None)
model_cfg.pop('summarizer_abbr', None)
model_cfg.pop('pred_postprocessor', None)
return MODELS.build(model_cfg)
@@ -4,6 +4,7 @@ from typing import List, Union
import tabulate
from mmengine.config import Config
from opencompass.datasets.custom import make_custom_dataset_config
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
@@ -56,18 +57,37 @@ def get_config_from_arg(args) -> Config:
3. Huggingface parameter groups and args.datasets
"""
if args.config:
return Config.fromfile(args.config, format_python_code=False)
if args.datasets is None:
raise ValueError('You must specify "--datasets" if you do not specify '
'a config file path.')
config = Config.fromfile(args.config, format_python_code=False)
for i, dataset in enumerate(config['datasets']):
if 'type' not in dataset:
config['datasets'][i] = make_custom_dataset_config(dataset)
return config
# parse dataset args
if not args.datasets and not args.custom_dataset_path:
raise ValueError('You must specify "--datasets" or '
'"--custom-dataset-path" if you do not specify a '
'config file path.')
datasets = []
datasets_dir = os.path.join(args.config_dir, 'datasets')
for dataset in match_cfg_file(datasets_dir, args.datasets):
get_logger().info(f'Loading {dataset[0]}: {dataset[1]}')
cfg = Config.fromfile(dataset[1])
for k in cfg.keys():
if k.endswith('_datasets'):
datasets += cfg[k]
if args.datasets:
datasets_dir = os.path.join(args.config_dir, 'datasets')
for dataset in match_cfg_file(datasets_dir, args.datasets):
get_logger().info(f'Loading {dataset[0]}: {dataset[1]}')
cfg = Config.fromfile(dataset[1])
for k in cfg.keys():
if k.endswith('_datasets'):
datasets += cfg[k]
else:
dataset = {'path': args.custom_dataset_path}
if args.custom_dataset_infer_method is not None:
dataset['infer_method'] = args.custom_dataset_infer_method
if args.custom_dataset_data_type is not None:
dataset['data_type'] = args.custom_dataset_data_type
if args.custom_dataset_meta_path is not None:
dataset['meta_path'] = args.custom_dataset_meta_path
dataset = make_custom_dataset_config(dataset)
datasets.append(dataset)
# parse model args
if not args.models and not args.hf_path:
raise ValueError('You must specify a config file path, '
'or specify --models and --datasets, or '
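When neither a config file nor --datasets is given, the new branch above assembles a single custom dataset entry straight from the CLI flags before handing it to make_custom_dataset_config. A hypothetical example of that dict (the path and values are invented; the keys mirror the code above):

```python
# Hypothetical values; each key corresponds to one of the new --custom-dataset-* flags.
dataset = {
    'path': 'data/my_dataset.jsonl',            # --custom-dataset-path
    'infer_method': 'gen',                      # --custom-dataset-infer-method ('gen' or 'ppl')
    'data_type': 'qa',                          # --custom-dataset-data-type ('qa' or 'mcq')
    'meta_path': 'data/my_dataset_meta.json',   # --custom-dataset-meta-path (optional)
}
# dataset = make_custom_dataset_config(dataset)  # then appended to `datasets`
```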
@@ -98,7 +118,7 @@ def get_config_from_arg(args) -> Config:
pad_token_id=args.pad_token_id,
run_cfg=dict(num_gpus=args.num_gpus))
models.append(model)
# parse summarizer args
summarizer = args.summarizer if args.summarizer is not None else 'example'
summarizers_dir = os.path.join(args.config_dir, 'summarizers')
s = match_cfg_file(summarizers_dir, [summarizer])[0]
......
@@ -138,6 +138,9 @@ def parse_args():
# set hf args
hf_parser = parser.add_argument_group('hf_args')
parse_hf_args(hf_parser)
# set custom dataset args
custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
parse_custom_dataset_args(custom_dataset_parser)
args = parser.parse_args()
if args.slurm:
assert args.partition is not None, (
@@ -199,6 +202,18 @@ def parse_hf_args(hf_parser):
hf_parser.add_argument('--pad-token-id', type=int)
def parse_custom_dataset_args(custom_dataset_parser):
"""These args are all for the quick construction of custom datasets."""
custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
custom_dataset_parser.add_argument('--custom-dataset-data-type',
type=str,
choices=['mcq', 'qa'])
custom_dataset_parser.add_argument('--custom-dataset-infer-method',
type=str,
choices=['gen', 'ppl'])
def main():
args = parse_args()
if args.dry_run:
......
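For reference, a self-contained sketch of how the new argument group surfaces on the parsed namespace; argparse converts the dashes to underscores, which is how get_config_from_arg reads them (e.g. args.custom_dataset_path above). The example values are invented, and parse_custom_dataset_args is copied from the diff so the snippet runs on its own:

```python
import argparse

def parse_custom_dataset_args(custom_dataset_parser):
    """These args are all for the quick construction of custom datasets."""
    custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
    custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
    custom_dataset_parser.add_argument('--custom-dataset-data-type', type=str, choices=['mcq', 'qa'])
    custom_dataset_parser.add_argument('--custom-dataset-infer-method', type=str, choices=['gen', 'ppl'])

parser = argparse.ArgumentParser()
parse_custom_dataset_args(parser.add_argument_group('custom_dataset_args'))
args = parser.parse_args(['--custom-dataset-path', 'data/my_dataset.jsonl',
                          '--custom-dataset-data-type', 'qa',
                          '--custom-dataset-infer-method', 'gen'])
print(args.custom_dataset_path, args.custom_dataset_data_type, args.custom_dataset_infer_method)
# -> data/my_dataset.jsonl qa gen
```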