Commit 7d346000 authored by gaotongxiao

initial commit
import argparse
import getpass
import os
import os.path as osp
from datetime import datetime
from mmengine.config import Config
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
from opencompass.utils import LarkReporter, Summarizer, get_logger
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', help='Config file path')
# add mutually exclusive args `--slurm` `--dlc`, default to local runner
    launch_method = parser.add_mutually_exclusive_group()
    launch_method.add_argument('--slurm',
                               action='store_true',
                               default=False,
                               help='Whether to use srun to launch tasks, if '
                               'True, `--partition(-p)` must be set. Defaults'
                               ' to False')
    launch_method.add_argument('--dlc',
                               action='store_true',
                               default=False,
                               help='Whether to use dlc to launch tasks, if '
                               'True, `--aliyun-cfg` must be set. Defaults'
                               ' to False')
# add general args
parser.add_argument('--debug',
                        help='Debug mode, in which the scheduler will run '
                        'tasks in a single process, and output will not be '
                        'redirected to files',
action='store_true',
default=False)
parser.add_argument('-m',
'--mode',
help='Running mode. You can choose "infer" if you '
'only want the inference results, or "eval" if you '
'already have the results and want to evaluate them, '
'or "viz" if you want to visualize the results.',
choices=['all', 'infer', 'eval', 'viz'],
default='all',
type=str)
parser.add_argument('-r',
'--reuse',
nargs='?',
type=str,
const='latest',
                        help='Reuse previous outputs & results, and run any '
                        'missing jobs present in the config. If its '
                        'argument is not specified, the latest results in '
                        'the work_dir will be reused. The argument can '
                        'also be a specific timestamp, e.g. 20230516_144254')
parser.add_argument('-w',
'--work-dir',
                        help='Work path, all the outputs will be saved in '
                        'this path, including the slurm logs, the evaluation'
                        ' results, the summary results, etc. If not specified,'
                        ' the work_dir will be set to "./outputs/default".',
default=None,
type=str)
parser.add_argument('-l',
'--lark',
help='Report the running status to lark bot',
action='store_true',
default=False)
parser.add_argument('--max-partition-size',
help='The maximum size of a task.',
type=int,
                        default=2000)
parser.add_argument(
'--gen-task-coef',
help='The dataset cost measurement coefficient for generation tasks',
type=int,
default=20)
parser.add_argument('--max-num-workers',
help='Max number of workers to run in parallel.',
type=int,
default=32)
parser.add_argument(
'--retry',
        help='Number of retries if the job fails when using slurm or dlc.',
type=int,
default=2)
# set srun args
slurm_parser = parser.add_argument_group('slurm_args')
parse_slurm_args(slurm_parser)
# set dlc args
dlc_parser = parser.add_argument_group('dlc_args')
parse_dlc_args(dlc_parser)
args = parser.parse_args()
if args.slurm:
assert args.partition is not None, (
'--partition(-p) must be set if you want to use slurm')
if args.dlc:
assert os.path.exists(args.aliyun_cfg), (
            'When launching tasks using dlc, it needs to be configured '
'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
' to specify a new path.')
return args
def parse_slurm_args(slurm_parser):
"""these args are all for slurm launch."""
slurm_parser.add_argument('-p',
'--partition',
help='Slurm partition name',
default=None,
type=str)
slurm_parser.add_argument('-q',
'--quotatype',
help='Slurm quota type',
default='auto',
type=str)
def parse_dlc_args(dlc_parser):
"""these args are all for dlc launch."""
dlc_parser.add_argument('--aliyun-cfg',
help='The config path for aliyun config',
default='~/.aliyun.cfg',
type=str)
def main():
args = parse_args()
# initialize logger
logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')
cfg = Config.fromfile(args.config)
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', './outputs/default/')
# cfg_time_str defaults to the current time
cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
if args.reuse:
if args.reuse == 'latest':
dirs = os.listdir(cfg.work_dir)
assert len(dirs) > 0, 'No previous results to reuse!'
dir_time_str = sorted(dirs)[-1]
else:
dir_time_str = args.reuse
        logger.info(f'Reusing experiments from {dir_time_str}')
elif args.mode in ['eval', 'viz']:
raise ValueError('You must specify -r or --reuse when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)
# dump config
output_config_path = osp.join(cfg.work_dir, 'configs',
f'{cfg_time_str}.py')
cfg.dump(output_config_path)
    # Config is intentionally reloaded here to avoid the issue that
    # initialized types cannot be serialized
cfg = Config.fromfile(output_config_path)
    # report to lark bot if --lark is specified
if not args.lark:
cfg['lark_bot_url'] = None
elif cfg.get('lark_bot_url', None):
content = f'{getpass.getuser()}\'s task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
if args.mode in ['all', 'infer']:
# Use SizePartitioner to split into subtasks
partitioner = SizePartitioner(osp.join(cfg['work_dir'],
'predictions/'),
max_task_size=args.max_partition_size,
gen_task_coef=args.gen_task_coef)
tasks = partitioner(cfg)
# execute the infer subtasks
exec_infer_runner(tasks, args, cfg)
# evaluate
if args.mode in ['all', 'eval']:
        # Use NaivePartitioner, which does not split tasks
partitioner = NaivePartitioner(osp.join(cfg['work_dir'], 'results/'))
tasks = partitioner(cfg)
# execute the eval tasks
exec_eval_runner(tasks, args, cfg)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer = Summarizer(cfg)
summarizer.summarize(time_str=cfg_time_str)
def exec_infer_runner(tasks, args, cfg):
"""execute infer runner according to args."""
if args.slurm:
runner = SlurmRunner(dict(type='OpenICLInferTask'),
max_num_workers=args.max_num_workers,
partition=args.partition,
quotatype=args.quotatype,
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
elif args.dlc:
runner = DLCRunner(dict(type='OpenICLInferTask'),
max_num_workers=args.max_num_workers,
aliyun_cfg=Config.fromfile(args.aliyun_cfg),
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
else:
runner = LocalRunner(
task=dict(type='OpenICLInferTask'),
# max_num_workers = args.max_num_workers,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
runner(tasks)
def exec_eval_runner(tasks, args, cfg):
"""execute infer runner according to args."""
if args.slurm:
runner = SlurmRunner(dict(type='OpenICLEvalTask'),
max_num_workers=args.max_num_workers,
partition=args.partition,
quotatype=args.quotatype,
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
elif args.dlc:
runner = DLCRunner(dict(type='OpenICLEvalTask'),
max_num_workers=args.max_num_workers,
aliyun_cfg=Config.fromfile(args.aliyun_cfg),
retry=args.retry,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
else:
runner = LocalRunner(
task=dict(type='OpenICLEvalTask'),
# max_num_workers = args.max_num_workers,
debug=args.debug,
lark_bot_url=cfg['lark_bot_url'])
runner(tasks)
if __name__ == '__main__':
main()
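# A minimal usage sketch for this launcher; the script name `run.py` and the
# config path below are illustrative assumptions, not part of this commit:
#
#   python run.py configs/eval_demo.py                          # local runner (default)
#   python run.py configs/eval_demo.py --slurm -p my_partition  # srun-based launch
#   python run.py configs/eval_demo.py --dlc --aliyun-cfg ~/.aliyun.cfg
#   python run.py configs/eval_demo.py -r latest -m eval        # reuse latest outputs, eval only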
import unittest
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.utils.prompt import PromptList
class TestPromptTemplate(unittest.TestCase):
def setUp(self) -> None:
self.qa_template = dict(begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt='instruct'),
'</E>',
],
round=[
dict(role='HUMAN', prompt='</input>'),
dict(role='BOT',
prompt='Answer: </answer>')
])
self.multiround_qa_template = dict(round=[
dict(role='HUMAN', prompt='</input>'),
dict(role='BOT', prompt='A1', end='\n'),
dict(role='HUMAN', prompt='Q1'),
dict(role='BOT', prompt='A2', end='\n\n'),
dict(role='HUMAN', prompt='Q2', begin='HUMAN:'),
dict(role='BOT', prompt='Answer: </answer>')
])
self.column_token_map = {
'input': '</input>',
'answer': '</answer>',
}
self.entry = {'input': 'Hello, how are you?', 'answer': 'Good.'}
def test_init(self):
template = 'Translate the following English text to French: {t}.'
column_token_map = {'input': '{t}'}
pt = PromptTemplate(template, column_token_map)
self.assertEqual(pt.template, template)
self.assertEqual(pt.column_token_map, column_token_map)
def test_generate_ice_item(self):
# Test simple prompt
template = 'Translate the following English text to French: {t}.'
column_token_map = {'input': '{t}'}
pt = PromptTemplate(template, column_token_map)
label = None
ice = pt.generate_ice_item(self.entry, label)
self.assertEqual(ice,
('Translate the following English text to French: '
'Hello, how are you?.'))
# test meta prompt style
pt = PromptTemplate(self.qa_template,
self.column_token_map,
ice_token='</E>')
label = None
ice = pt.generate_ice_item(self.entry, label)
ice_target = PromptList([
{
'section': 'ice',
'pos': 'begin'
},
dict(role='HUMAN', prompt='Hello, how are you?'),
dict(role='BOT', prompt='Answer: Good.'),
{
'section': 'ice',
'pos': 'end'
},
])
self.assertEqual(ice, ice_target)
# test_multiround
pt = PromptTemplate(self.multiround_qa_template,
self.column_token_map,
ice_token='</E>')
label = None
ice = pt.generate_ice_item(self.entry, label)
ice_target = PromptList([
{
'section': 'ice',
'pos': 'begin'
},
dict(role='HUMAN', prompt='Hello, how are you?'),
dict(role='BOT', prompt='A1', end='\n'),
dict(role='HUMAN', prompt='Q1'),
dict(role='BOT', prompt='A2', end='\n\n'),
dict(role='HUMAN', prompt='Q2', begin='HUMAN:'),
dict(role='BOT', prompt='Answer: Good.'),
{
'section': 'ice',
'pos': 'end'
},
])
self.assertEqual(ice, ice_target)
def test_generate_label_prompt_item(self):
# Test simple prompt
template = '</E> Translate the following English text to French: {t}.'
column_token_map = {'input': '{t}'}
pt = PromptTemplate(template, column_token_map, ice_token='</E>')
ice = 'ICE'
label = None
prompt = pt.generate_label_prompt_item(self.entry, ice, label)
self.assertEqual(
prompt, ('ICE Translate the following English text to French: '
'Hello, how are you?.'))
ice = PromptList([
{
'section': 'ice',
'pos': 'begin'
},
dict(role='HUMAN', prompt='h1'),
dict(role='BOT', prompt='b1'),
{
'section': 'ice',
'pos': 'end'
},
])
# test meta prompt style
pt = PromptTemplate(self.qa_template,
self.column_token_map,
ice_token='</E>')
label = None
prompt = pt.generate_label_prompt_item(self.entry, ice, label)
target = PromptList([
{
'section': 'begin',
'pos': 'begin'
},
dict(role='SYSTEM', fallback_role='HUMAN', prompt='instruct'),
{
'section': 'ice',
'pos': 'begin'
},
dict(role='HUMAN', prompt='h1'),
dict(role='BOT', prompt='b1'),
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'begin',
'pos': 'end'
},
{
'section': 'round',
'pos': 'begin'
},
dict(role='HUMAN', prompt='Hello, how are you?'),
dict(role='BOT', prompt='Answer: Good.'),
{
'section': 'round',
'pos': 'end'
},
])
self.assertEqual(prompt, target)
# test_multiround
pt = PromptTemplate(self.multiround_qa_template,
self.column_token_map,
ice_token='</E>')
label = None
prompt = pt.generate_label_prompt_item(self.entry, ice, label)
target = PromptList([
{
'section': 'round',
'pos': 'begin'
},
dict(role='HUMAN', prompt='Hello, how are you?'),
dict(role='BOT', prompt='A1', end='\n'),
dict(role='HUMAN', prompt='Q1'),
dict(role='BOT', prompt='A2', end='\n\n'),
dict(role='HUMAN', prompt='Q2', begin='HUMAN:'),
dict(role='BOT', prompt='Answer: Good.'),
{
'section': 'round',
'pos': 'end'
},
])
self.assertEqual(prompt, target)
def test_generate_item(self):
# Test simple prompt
template = 'Translate the following English text to French: {t}.'
column_token_map = {'input': '{t}'}
pt = PromptTemplate(template, column_token_map)
item = pt.generate_item(self.entry)
self.assertEqual(item,
('Translate the following English text to French: '
'Hello, how are you?.'))
ice = PromptList([
{
'section': 'ice',
'pos': 'begin'
},
dict(role='HUMAN', prompt='h1'),
dict(role='BOT', prompt='b1'),
{
'section': 'ice',
'pos': 'end'
},
])
# test meta prompt (without system role)
pt = PromptTemplate(self.qa_template,
self.column_token_map,
ice_token='</E>')
prompt = pt.generate_item(self.entry, ice_field_replace_token=ice)
target = PromptList([
{
'section': 'begin',
'pos': 'begin'
},
dict(role='SYSTEM', fallback_role='HUMAN', prompt='instruct'),
{
'section': 'ice',
'pos': 'begin'
},
dict(role='HUMAN', prompt='h1'),
dict(role='BOT', prompt='b1'),
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'begin',
'pos': 'end'
},
{
'section': 'round',
'pos': 'begin'
},
dict(role='HUMAN', prompt='Hello, how are you?'),
dict(role='BOT', prompt='Answer: Good.'),
{
'section': 'round',
'pos': 'end'
},
])
self.assertEqual(prompt, target)
pt = PromptTemplate(self.multiround_qa_template,
self.column_token_map,
ice_token='</E>')
prompt = pt.generate_item(self.entry, ice_field_replace_token=ice)
target = PromptList([
{
'section': 'round',
'pos': 'begin'
},
dict(role='HUMAN', prompt='Hello, how are you?'),
dict(role='BOT', prompt='A1', end='\n'),
dict(role='HUMAN', prompt='Q1'),
dict(role='BOT', prompt='A2', end='\n\n'),
dict(role='HUMAN', prompt='Q2', begin='HUMAN:'),
dict(role='BOT', prompt='Answer: Good.'),
{
'section': 'round',
'pos': 'end'
},
])
self.assertEqual(prompt, target)
import unittest
from opencompass.models.base_api import APITemplateParser
from opencompass.utils.prompt import PromptList
class TestAPITemplateParser(unittest.TestCase):
def setUp(self):
self.parser = APITemplateParser()
self.prompt = PromptList([
{
'section': 'begin',
'pos': 'begin'
},
'begin',
{
'role': 'SYSTEM',
'fallback_role': 'HUMAN',
'prompt': 'system msg'
},
{
'section': 'ice',
'pos': 'begin'
},
{
'role': 'HUMAN',
'prompt': 'U0'
},
{
'role': 'BOT',
'prompt': 'B0'
},
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'begin',
'pos': 'end'
},
{
'section': 'round',
'pos': 'begin'
},
{
'role': 'HUMAN',
'prompt': 'U1'
},
{
'role': 'BOT',
'prompt': 'B1'
},
{
'role': 'HUMAN',
'prompt': 'U2'
},
{
'role': 'BOT',
'prompt': 'B2'
},
{
'section': 'round',
'pos': 'end'
},
{
'section': 'end',
'pos': 'begin'
},
'end',
{
'section': 'end',
'pos': 'end'
},
])
def test_parse_template_str_input(self):
prompt = self.parser.parse_template('Hello, world!', mode='gen')
self.assertEqual(prompt, 'Hello, world!')
prompt = self.parser.parse_template('Hello, world!', mode='ppl')
self.assertEqual(prompt, 'Hello, world!')
def test_parse_template_list_input(self):
prompt = self.parser.parse_template(['Hello', 'world'], mode='gen')
self.assertEqual(prompt, ['Hello', 'world'])
prompt = self.parser.parse_template(['Hello', 'world'], mode='ppl')
self.assertEqual(prompt, ['Hello', 'world'])
def test_parse_template_PromptList_input_no_meta_template(self):
prompt = self.parser.parse_template(self.prompt, mode='gen')
self.assertEqual(prompt,
'begin\nsystem msg\nU0\nB0\nU1\nB1\nU2\nB2\nend')
prompt = self.parser.parse_template(self.prompt, mode='ppl')
self.assertEqual(prompt,
'begin\nsystem msg\nU0\nB0\nU1\nB1\nU2\nB2\nend')
def test_parse_template_PromptList_input_with_meta_template(self):
parser = APITemplateParser(meta_template=dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True)
], ))
with self.assertWarns(Warning):
prompt = parser.parse_template(self.prompt, mode='gen')
self.assertEqual(
prompt,
PromptList([
{
'role': 'HUMAN',
'prompt': 'system msg\nU0'
},
{
'role': 'BOT',
'prompt': 'B0'
},
{
'role': 'HUMAN',
'prompt': 'U1'
},
{
'role': 'BOT',
'prompt': 'B1'
},
{
'role': 'HUMAN',
'prompt': 'U2'
},
]))
with self.assertWarns(Warning):
prompt = parser.parse_template(self.prompt, mode='ppl')
self.assertEqual(
prompt,
PromptList([
{
'role': 'HUMAN',
'prompt': 'system msg\nU0'
},
{
'role': 'BOT',
'prompt': 'B0'
},
{
'role': 'HUMAN',
'prompt': 'U1'
},
{
'role': 'BOT',
'prompt': 'B1'
},
{
'role': 'HUMAN',
'prompt': 'U2'
},
{
'role': 'BOT',
'prompt': 'B2'
},
]))
parser = APITemplateParser(meta_template=dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True)
],
reserved_roles=[
dict(role='SYSTEM', api_role='SYSTEM'),
],
))
with self.assertWarns(Warning):
prompt = parser.parse_template(self.prompt, mode='gen')
self.assertEqual(
prompt,
PromptList([
{
'role': 'SYSTEM',
'prompt': 'system msg'
},
{
'role': 'HUMAN',
'prompt': 'U0'
},
{
'role': 'BOT',
'prompt': 'B0'
},
{
'role': 'HUMAN',
'prompt': 'U1'
},
{
'role': 'BOT',
'prompt': 'B1'
},
{
'role': 'HUMAN',
'prompt': 'U2'
},
]))
with self.assertWarns(Warning):
prompt = parser.parse_template(self.prompt, mode='ppl')
self.assertEqual(
prompt,
PromptList([
{
'role': 'SYSTEM',
'prompt': 'system msg'
},
{
'role': 'HUMAN',
'prompt': 'U0'
},
{
'role': 'BOT',
'prompt': 'B0'
},
{
'role': 'HUMAN',
'prompt': 'U1'
},
{
'role': 'BOT',
'prompt': 'B1'
},
{
'role': 'HUMAN',
'prompt': 'U2'
},
{
'role': 'BOT',
'prompt': 'B2'
},
]))
import unittest
from opencompass.models.base import LMTemplateParser
from opencompass.utils.prompt import PromptList
class TestLMTemplateParser(unittest.TestCase):
def setUp(self):
self.parser = LMTemplateParser()
self.prompt = PromptList([
{
'section': 'begin',
'pos': 'begin'
},
'begin',
{
'role': 'SYSTEM',
'fallback_role': 'HUMAN',
'prompt': 'system msg'
},
{
'section': 'ice',
'pos': 'begin'
},
{
'role': 'HUMAN',
'prompt': 'U0'
},
{
'role': 'BOT',
'prompt': 'B0'
},
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'begin',
'pos': 'end'
},
{
'section': 'round',
'pos': 'begin'
},
{
'role': 'HUMAN',
'prompt': 'U1',
'end': '\n'
},
{
'role': 'BOT',
'prompt': 'B1'
},
{
'role': 'HUMAN',
'prompt': 'U2'
},
{
'role': 'BOT',
'prompt': 'B2'
},
{
'section': 'round',
'pos': 'end'
},
{
'section': 'end',
'pos': 'begin'
},
'end',
{
'section': 'end',
'pos': 'end'
},
])
def test_parse_template_str_input(self):
prompt = self.parser.parse_template('Hello, world!', mode='gen')
self.assertEqual(prompt, 'Hello, world!')
prompt = self.parser.parse_template('Hello, world!', mode='ppl')
self.assertEqual(prompt, 'Hello, world!')
def test_parse_template_list_input(self):
prompt = self.parser.parse_template(['Hello', 'world'], mode='gen')
self.assertEqual(prompt, ['Hello', 'world'])
prompt = self.parser.parse_template(['Hello', 'world'], mode='ppl')
self.assertEqual(prompt, ['Hello', 'world'])
def test_parse_template_PromptList_input_no_meta_template(self):
prompt = self.parser.parse_template(self.prompt, mode='gen')
self.assertEqual(prompt,
'begin\nsystem msg\nU0\nB0\nU1\nB1\nU2\nB2\nend')
prompt = self.parser.parse_template(self.prompt, mode='ppl')
self.assertEqual(prompt,
'begin\nsystem msg\nU0\nB0\nU1\nB1\nU2\nB2\nend')
def test_parse_template_PromptList_input_with_meta_template(self):
# no SYSTEM role, early generation in THOUGHTS
parser = LMTemplateParser(meta_template=dict(
begin='meta instruction\n',
round=[
dict(role='HUMAN', begin='<|HUMAN|>:', end='<eoh>\n'),
dict(role='THOUGHTS',
begin='<|Inner Thoughts|>:',
generate=True,
end='<eot>\n',
prompt='None'),
dict(role='BOT', begin='<|BOT|>:', end='<eob>\n'),
],
end='meta end',
))
prompt = parser.parse_template(self.prompt, mode='gen')
target = ('meta instruction\n'
'begin'
'<|HUMAN|>:system msg<eoh>\n'
'<|HUMAN|>:U0<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B0<eob>\n'
'<|HUMAN|>:U1\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B1<eob>\n'
'<|HUMAN|>:U2<eoh>\n'
'<|Inner Thoughts|>:')
self.assertEqual(prompt, target)
prompt = parser.parse_template(self.prompt, mode='ppl')
target = ('meta instruction\n'
'begin'
'<|HUMAN|>:system msg<eoh>\n'
'<|HUMAN|>:U0<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B0<eob>\n'
'<|HUMAN|>:U1\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B1<eob>\n'
'<|HUMAN|>:U2<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B2<eob>\n'
'end'
'meta end')
self.assertEqual(prompt, target)
# no SYSTEM role, generation in BOT
parser = LMTemplateParser(meta_template=dict(
begin='meta instruction\n',
round=[
dict(role='HUMAN', begin='<|HUMAN|>:', end='<eoh>\n'),
dict(role='THOUGHTS',
begin='<|Inner Thoughts|>:',
end='<eot>\n',
prompt='None'),
dict(
role='BOT', begin='<|BOT|>:', end='<eob>\n',
generate=True),
],
end='meta end',
))
prompt = parser.parse_template(self.prompt, mode='gen')
target = ('meta instruction\n'
'begin'
'<|HUMAN|>:system msg<eoh>\n'
'<|HUMAN|>:U0<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B0<eob>\n'
'<|HUMAN|>:U1\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B1<eob>\n'
'<|HUMAN|>:U2<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:')
self.assertEqual(prompt, target)
prompt = parser.parse_template(self.prompt, mode='ppl')
target = ('meta instruction\n'
'begin'
'<|HUMAN|>:system msg<eoh>\n'
'<|HUMAN|>:U0<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B0<eob>\n'
'<|HUMAN|>:U1\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B1<eob>\n'
'<|HUMAN|>:U2<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B2<eob>\n'
'end'
'meta end')
self.assertEqual(prompt, target)
# with SYSTEM role, generation in BOT
parser = LMTemplateParser(meta_template=dict(
begin='meta instruction\n',
round=[
dict(role='HUMAN', begin='<|HUMAN|>:', end='<eoh>\n'),
dict(role='THOUGHTS',
begin='<|Inner Thoughts|>:',
end='<eot>\n',
prompt='None'),
dict(
role='BOT', begin='<|BOT|>:', end='<eob>\n',
generate=True),
],
end='meta end',
reserved_roles=[
dict(role='SYSTEM', begin='<|SYSTEM|>:', end='<eos>\n')
]))
prompt = parser.parse_template(self.prompt, mode='gen')
target = ('meta instruction\n'
'begin'
'<|SYSTEM|>:system msg<eos>\n'
'<|HUMAN|>:U0<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B0<eob>\n'
'<|HUMAN|>:U1\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B1<eob>\n'
'<|HUMAN|>:U2<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:')
self.assertEqual(prompt, target)
prompt = parser.parse_template(self.prompt, mode='ppl')
target = ('meta instruction\n'
'begin'
'<|SYSTEM|>:system msg<eos>\n'
'<|HUMAN|>:U0<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B0<eob>\n'
'<|HUMAN|>:U1\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B1<eob>\n'
'<|HUMAN|>:U2<eoh>\n'
'<|Inner Thoughts|>:None<eot>\n'
'<|BOT|>:B2<eob>\n'
'end'
'meta end')
self.assertEqual(prompt, target)
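# The test classes above have no __main__ guard, so they are meant to be picked
# up by a test runner. A minimal sketch, assuming the files live under a
# `tests/` directory (the path is an assumption, not stated in this commit):
#
#   python -m pytest tests/ -k "PromptTemplate or TemplateParser"
#   python -m unittest discover -s tests -p "test_*.py"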
import argparse
import copy
import json
import os.path as osp
import mmengine
from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist
from tqdm import tqdm
from opencompass.registry import TEXT_POSTPROCESSORS
from opencompass.utils import build_dataset_from_cfg, get_infer_output_path
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', help='Config file path')
parser.add_argument(
'-f',
'--force',
help='Force to run the task even if the results already exist',
action='store_true',
default=False)
parser.add_argument('-w',
'--work-dir',
help='Work path, all the outputs will be '
'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc. '
'If not specified, the work_dir will be set to '
'./outputs/default.',
default=None,
type=str)
args = parser.parse_args()
return args
class BadcaseShower:
""""""
def __init__(self, cfg: ConfigDict) -> None:
self.cfg = cfg
self.model_cfg = copy.deepcopy(self.cfg['model'])
self.dataset_cfg = copy.deepcopy(self.cfg['dataset'])
self.work_dir = self.cfg.get('work_dir')
# Load Dataset
self.eval_cfg = self.dataset_cfg.get('eval_cfg')
self.ds_split = self.eval_cfg.get('ds_split', None)
self.ds_column = self.eval_cfg.get('ds_column')
def run(self):
filename = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'predictions'))
root, ext = osp.splitext(filename)
partial_filename = root + '_0' + ext
if not osp.exists(osp.realpath(filename)) and not osp.exists(
osp.realpath(partial_filename)):
print(f'{filename} not found')
return
dataset = build_dataset_from_cfg(self.dataset_cfg)
# Postprocess dataset if necessary
if 'dataset_postprocessor' in self.eval_cfg:
def postprocess(sample):
s = sample[self.ds_column]
proc = TEXT_POSTPROCESSORS.get(
self.eval_cfg['dataset_postprocessor']['type'])
sample[self.ds_column] = proc(s)
return sample
dataset = dataset.map(postprocess)
# Load predictions
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
else:
filename = partial_filename
preds, offset = {}, 0
i = 1
while osp.exists(osp.realpath(filename)):
_preds = mmengine.load(filename)
filename = root + f'_{i}' + ext
i += 1
for _o in range(len(_preds)):
preds[str(offset)] = _preds[str(_o)]
offset += 1
pred_strs = [preds[str(i)]['prediction'] for i in range(len(preds))]
# Postprocess predictions if necessary
if 'pred_postprocessor' in self.eval_cfg:
proc = TEXT_POSTPROCESSORS.get(
self.eval_cfg['pred_postprocessor']['type'])
pred_strs = [proc(s) for s in pred_strs]
if self.ds_split:
references = dataset[self.ds_split][self.ds_column]
else:
references = dataset[self.ds_column]
if len(pred_strs) != len(references):
            print(f'length mismatch: {len(pred_strs)} predictions vs '
                  f'{len(references)} references')
return
# combine cases
allcase, badcase = [], []
if 'in-context examples' in preds['0']:
            # ppl eval: each prediction records per-label prompts and PPL values
for i, (pred_str,
reference) in enumerate(zip(tqdm(pred_strs), references)):
ref_str = str(reference)
try:
pred_prompt = preds[str(i)]['label: ' +
pred_str]['testing input']
pred_PPL = preds[str(i)]['label: ' + pred_str]['PPL']
ref_prompt = preds[str(i)]['label: ' +
ref_str]['testing input']
ref_PPL = preds[str(i)]['label: ' + ref_str]['PPL']
except KeyError:
continue
item = {
'prediction_prompt': pred_prompt,
'prediction': pred_str,
'prediction_PPL': pred_PPL,
'reference_prompt': ref_prompt,
'reference': ref_str,
'reference_PPL': ref_PPL
}
if pred_str != ref_str:
badcase.append(item)
allcase.append(item)
else:
allcase.append(item)
else:
            # gen eval: each prediction records the origin prompt and generated text
for i, (pred_str,
reference) in enumerate(zip(tqdm(pred_strs), references)):
ref_str = str(reference)
origin_prompt = preds[str(i)]['origin_prompt']
item = {
'origin_prompt': origin_prompt,
'prediction': pred_str,
'reference': ref_str
}
# FIXME: we now consider all cases as bad cases
badcase.append(item)
allcase.append(item)
# Save result
out_path = get_infer_output_path(
self.cfg['model'], self.cfg['dataset'],
osp.join(self.work_dir, 'case_analysis/bad'))
mkdir_or_exist(osp.split(out_path)[0])
with open(out_path, 'w', encoding='utf-8') as f:
json.dump(badcase, f, indent=4, ensure_ascii=False)
out_path = get_infer_output_path(
self.cfg['model'], self.cfg['dataset'],
osp.join(self.work_dir, 'case_analysis/all'))
mkdir_or_exist(osp.split(out_path)[0])
with open(out_path, 'w', encoding='utf-8') as f:
json.dump(allcase, f, indent=4, ensure_ascii=False)
def dispatch_tasks(cfg, force=False):
for model in cfg['models']:
for dataset in cfg['datasets']:
if force or not osp.exists(
get_infer_output_path(
model, dataset,
osp.join(cfg['work_dir'], 'case_analysis/all'))):
BadcaseShower({
'model': model,
'dataset': dataset,
'work_dir': cfg['work_dir']
}).run()
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set work_dir
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', './outputs/default')
dispatch_tasks(cfg, force=args.force)
if __name__ == '__main__':
main()
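# A minimal usage sketch for this case-analysis tool; the script name
# `case_analysis.py` is an illustrative assumption:
#
#   python case_analysis.py configs/eval_demo.py -w outputs/default/20230516_144254
#   python case_analysis.py configs/eval_demo.py -w outputs/default/20230516_144254 -f
#
# For every (model, dataset) pair it dumps JSON files under
# <work_dir>/case_analysis/bad and <work_dir>/case_analysis/all.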
import argparse
import getpass
import os
import os.path as osp
from datetime import datetime
from mmengine.config import Config
from opencompass.registry import PARTITIONERS, RUNNERS
from opencompass.runners import SlurmRunner
from opencompass.utils import LarkReporter, Summarizer, get_logger
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', help='Config file path')
parser.add_argument('-p',
'--partition',
help='Slurm partition name',
default=None,
type=str)
parser.add_argument('-q',
'--quotatype',
help='Slurm quota type',
default='auto',
type=str)
parser.add_argument('--debug',
                        help='Debug mode, in which the scheduler will run '
                        'tasks in a single process, and output will not be '
                        'redirected to files',
action='store_true',
default=False)
parser.add_argument('-m',
'--mode',
help='Running mode. You can choose "infer" if you '
'only want the inference results, or "eval" if you '
'already have the results and want to evaluate them, '
'or "viz" if you want to visualize the results.',
choices=['all', 'infer', 'eval', 'viz'],
default='all',
type=str)
parser.add_argument('-r',
'--reuse',
nargs='?',
type=str,
const='latest',
                        help='Reuse previous outputs & results, and run any '
                        'missing jobs present in the config. If its '
                        'argument is not specified, the latest results in '
                        'the work_dir will be reused. The argument can '
                        'also be a specific timestamp, e.g. 20230516_144254')
parser.add_argument('-w',
'--work-dir',
help='Work path, all the outputs will be '
'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc. '
'If not specified, the work_dir will be set to '
'./outputs/default.',
default=None,
type=str)
parser.add_argument('-l',
'--lark',
help='Report the running status to lark bot',
action='store_true',
default=False)
args = parser.parse_args()
return args
def main():
args = parse_args()
# initialize logger
logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')
cfg = Config.fromfile(args.config)
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', './outputs/default/')
# cfg_time_str defaults to the current time
cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
if args.reuse:
if args.reuse == 'latest':
dirs = os.listdir(cfg.work_dir)
assert len(dirs) > 0, 'No previous results to reuse!'
dir_time_str = sorted(dirs)[-1]
else:
dir_time_str = args.reuse
        logger.info(f'Reusing experiments from {dir_time_str}')
elif args.mode in ['eval', 'viz']:
raise ValueError('You must specify -r or --reuse when running in eval '
'or viz mode!')
# update "actual" work_dir
cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)
# dump config
output_config_path = osp.join(cfg.work_dir, 'configs',
f'{cfg_time_str}.py')
cfg.dump(output_config_path)
    # Config is intentionally reloaded here to avoid the issue that
    # initialized types cannot be serialized
cfg = Config.fromfile(output_config_path)
    # report to lark bot if --lark is specified
if not args.lark:
cfg['lark_bot_url'] = None
elif cfg.get('lark_bot_url', None):
        content = f'{getpass.getuser()}\'s new task has been launched!'
LarkReporter(cfg['lark_bot_url']).post(content)
if cfg.get('infer', None) is not None and args.mode in ['all', 'infer']:
if args.partition is not None:
if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
cfg.infer.runner.partition = args.partition
cfg.infer.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.infer.runner.debug = True
if args.lark:
cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'],
'predictions/')
partitioner = PARTITIONERS.build(cfg.infer.partitioner)
tasks = partitioner(cfg)
runner = RUNNERS.build(cfg.infer.runner)
runner(tasks)
# evaluate
if cfg.get('eval', None) is not None and args.mode in ['all', 'eval']:
if args.partition is not None:
            if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
cfg.eval.runner.partition = args.partition
cfg.eval.runner.quotatype = args.quotatype
else:
logger.warning('SlurmRunner is not used, so the partition '
'argument is ignored.')
if args.debug:
cfg.eval.runner.debug = True
if args.lark:
cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
partitioner = PARTITIONERS.build(cfg.eval.partitioner)
tasks = partitioner(cfg)
runner = RUNNERS.build(cfg.eval.runner)
runner(tasks)
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer = Summarizer(cfg)
summarizer.summarize(time_str=cfg_time_str)
if __name__ == '__main__':
main()
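# A minimal usage sketch for this registry-driven launcher, which builds the
# partitioner/runner declared under `infer` and `eval` in the config; the
# script name `run_cfg.py` is an illustrative assumption:
#
#   python run_cfg.py configs/eval_demo.py
#   python run_cfg.py configs/eval_demo.py -p my_partition -q auto  # only honored for SlurmRunner
#   python run_cfg.py configs/eval_demo.py -r 20230516_144254 -m viz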
import argparse
import fnmatch
from typing import Dict
from mmengine.config import Config, ConfigDict
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
from opencompass.utils import (Menu, build_dataset_from_cfg,
build_model_from_cfg, dataset_abbr_from_cfg,
model_abbr_from_cfg)
def parse_args():
parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', help='Config file path')
parser.add_argument('-n', '--non-interactive', action='store_true')
parser.add_argument('-a', '--all', action='store_true')
parser.add_argument('-p',
'--pattern',
type=str,
help='To match the dataset abbr.')
args = parser.parse_args()
return args
def parse_model_cfg(model_cfg: ConfigDict) -> Dict[str, ConfigDict]:
model2cfg = {}
for model in model_cfg:
model2cfg[model_abbr_from_cfg(model)] = model
return model2cfg
def parse_dataset_cfg(dataset_cfg: ConfigDict) -> Dict[str, ConfigDict]:
dataset2cfg = {}
for dataset in dataset_cfg:
dataset2cfg[dataset_abbr_from_cfg(dataset)] = dataset
return dataset2cfg
def print_prompts(model_cfg, dataset_cfg):
# TODO: A really dirty method that copies code from PPLInferencer and
# GenInferencer. In the future, the prompt extraction code should be
# extracted and generalized as a static method in these Inferencers
# and reused here.
if model_cfg:
max_seq_len = model_cfg.max_seq_len
if not model_cfg['type'].is_api:
model_cfg['tokenizer_only'] = True
model = build_model_from_cfg(model_cfg)
else:
max_seq_len = None
model = None
infer_cfg = dataset_cfg.get('infer_cfg')
fix_id_list = infer_cfg.inferencer.get('fix_id_list', [])
dataset = build_dataset_from_cfg(dataset_cfg)
ice_template = None
if hasattr(infer_cfg, 'ice_template'):
ice_template = ICL_PROMPT_TEMPLATES.build(infer_cfg['ice_template'])
prompt_template = None
if hasattr(infer_cfg, 'prompt_template'):
prompt_template = ICL_PROMPT_TEMPLATES.build(
infer_cfg['prompt_template'])
infer_cfg['retriever']['dataset'] = dataset
retriever = ICL_RETRIEVERS.build(infer_cfg['retriever'])
if fix_id_list:
ice_idx_list = retriever.retrieve(fix_id_list)
else:
ice_idx_list = retriever.retrieve()
assert infer_cfg.inferencer.type in [PPLInferencer, GenInferencer], \
'Only PPLInferencer and GenInferencer are supported'
if infer_cfg.inferencer.type == PPLInferencer:
labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template)
ice = [
retriever.generate_ice(ice_idx_list[idx],
ice_template=ice_template)
for idx in range(len(ice_idx_list))
]
print('-' * 100)
print('ICE Template:')
print('-' * 100)
print(ice[0])
print('-' * 100)
for label in labels:
idx = 0
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template,
remain_sep=None)
if max_seq_len is not None:
prompt_token_num = model.get_token_len_from_template(prompt)
while len(ice_idx_list[idx]
) > 0 and prompt_token_num > max_seq_len:
num_ice = len(ice_idx_list[idx])
print(f'Truncating ice {num_ice} -> {num_ice - 1}',
f'Number of tokens: {prompt_token_num} -> ...')
ice_idx_list[idx] = ice_idx_list[idx][:-1]
ice[idx] = retriever.generate_ice(
ice_idx_list[idx], ice_template=ice_template)
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = model.get_token_len_from_template(
prompt)
print(f'Number of tokens: {prompt_token_num}')
if model is not None:
prompt = model.parse_template(prompt, mode='ppl')
print('-' * 100)
print(f'Label: {label}')
print('Sample prompt:')
print('-' * 100)
print(prompt)
print('-' * 100)
elif infer_cfg.inferencer.type == GenInferencer:
idx, ice_idx = 0, ice_idx_list[0]
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
gen_field_replace_token=infer_cfg.inferencer.get(
'gen_field_replace_token', ''),
ice_template=ice_template,
prompt_template=prompt_template)
if max_seq_len is not None:
prompt_token_num = model.get_token_len_from_template(prompt)
while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
num_ice = len(ice_idx)
print(f'Truncating ice {num_ice} -> {num_ice - 1}',
f'Number of tokens: {prompt_token_num} -> ...')
ice_idx = ice_idx[:-1]
ice = retriever.generate_ice(ice_idx,
ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
gen_field_replace_token=infer_cfg.inferencer.get(
'gen_field_replace_token', ''),
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = model.get_token_len_from_template(prompt)
print(f'Number of tokens: {prompt_token_num}')
if model is not None:
prompt = model.parse_template(prompt, mode='gen')
print('-' * 100)
print('Sample prompt:')
print('-' * 100)
print(prompt)
print('-' * 100)
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
model2cfg = parse_model_cfg(cfg.models) if 'models' in cfg else {
'None': None
}
if 'datasets' in cfg:
dataset2cfg = parse_dataset_cfg(cfg.datasets)
else:
dataset2cfg = {}
for key in cfg.keys():
if key.endswith('_datasets'):
dataset2cfg.update(parse_dataset_cfg(cfg[key]))
if args.pattern is not None:
matches = fnmatch.filter(dataset2cfg, args.pattern)
if len(matches) == 0:
raise ValueError(
                'No dataset matches the pattern. Please select from: \n' +
'\n'.join(dataset2cfg.keys()))
dataset2cfg = {k: dataset2cfg[k] for k in matches}
if not args.all:
if not args.non_interactive:
model, dataset = Menu(
[list(model2cfg.keys()),
list(dataset2cfg.keys())], [
f'Please make a selection of {s}:'
for s in ['model', 'dataset']
]).run()
else:
model = list(model2cfg.keys())[0]
dataset = list(dataset2cfg.keys())[0]
model_cfg = model2cfg[model]
dataset_cfg = dataset2cfg[dataset]
print_prompts(model_cfg, dataset_cfg)
else:
for model_abbr, model_cfg in model2cfg.items():
for dataset_abbr, dataset_cfg in dataset2cfg.items():
print('=' * 64, '[BEGIN]', '=' * 64)
print(f'[MODEL]: {model_abbr}')
print(f'[DATASET]: {dataset_abbr}')
print('---')
print_prompts(model_cfg, dataset_cfg)
print('=' * 65, '[END]', '=' * 65)
print()
if __name__ == '__main__':
main()
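# A minimal usage sketch for this prompt viewer; the script name
# `prompt_viewer.py` and the dataset pattern are illustrative assumptions:
#
#   python prompt_viewer.py configs/eval_demo.py        # choose model/dataset from a menu
#   python prompt_viewer.py configs/eval_demo.py -n     # non-interactive, take the first entries
#   python prompt_viewer.py configs/eval_demo.py -a -p "*law*"   # print prompts for all matches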
import argparse
from typing import Dict
from mmengine.config import Config, ConfigDict
from opencompass.utils import Menu, build_model_from_cfg, model_abbr_from_cfg
from opencompass.utils.prompt import PromptList
test_prompts = [
PromptList([
{
'section': 'begin',
'pos': 'begin'
},
{
'role':
'SYSTEM',
'fallback_role':
'HUMAN',
'prompt':
'The following are multiple choice questions (with answers) about professional law.' # noqa
},
'',
{
'section': 'ice',
'pos': 'begin'
},
{
'role':
'HUMAN',
'prompt':
"Without a warrant, police officers searched the garbage cans in the alley behind a man's house and discovered chemicals used to make methamphetamine, as well as cooking utensils and containers with the man's fingerprints on them. The alley was a public thoroughfare maintained by the city, and the garbage was picked up once a week by a private sanitation company. The items were found inside the garbage cans in plastic bags that had been tied closed and further secured with tape. The man was charged in federal court with the manufacture of methamphetamine. Did the search of the garbage cans violate the Fourth Amendment?\nA. No, because the man had no reasonable expectation of privacy in garbage left in the alley.\nB. No, because the probative value of the evidence outweighs the man's modest privacy claims in his garbage.\nC. Yes, because the alley was within the curtilage of the man's home and entry without a warrant was unconstitutional.\nD. Yes, because there is a reasonable expectation of privacy in one's secured garbage containers.\nAnswer: " # noqa
},
{
'role': 'BOT',
'prompt': 'A\n'
},
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'ice',
'pos': 'begin'
},
{
'role':
'HUMAN',
'prompt':
'A man borrowed $500,000 from a bank, securing the loan with a mortgage on a commercial building he owned. The mortgage provided as follows: "No prepayment may be made on this loan during the first two years after the date of this mortgage. Thereafter, prepayment may be made in any amount at any time but only if accompanied by a prepayment fee of 5% of the amount prepaid." One year later, the man received an unexpected cash gift of $1 million and wished to pay off the $495,000 principal balance still owed on the loan. $495,000 principal balance still owed on the loan. Concerned that the bank might refuse prepayment, despite a rise in market interest rates in the year since the loan was made, or at least insist on the 5% prepayment fee, the man consulted an attorney concerning the enforceability of the above-quoted clause. There is no applicable statute. What is the attorney likely to say? \nA. The entire clause is unenforceable, because it violates a public policy favoring the prompt and early repayment of debt.\nB. The entire clause is unenforceable, because the rise in interest rates will allow the bank to reloan the funds without loss.\nC. The two-year prepayment prohibition and the prepayment fee provision are both valid and enforceable.\nD. The two-year prepayment prohibition is unenforceable, but the prepayment fee provision is enforceable.\nAnswer: ' # noqa
},
{
'role': 'BOT',
'prompt': 'D\n'
},
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'ice',
'pos': 'begin'
},
{
'role':
'HUMAN',
'prompt':
"A woman and a defendant entered into an arrangement where the woman promised to pay the defendant $10,000 to act as a surrogate mother. In return, the defendant agreed to be implanted with the woman's embryo and carry the baby to term. The woman paid the defendant the $10,000 upfront. During the seventh month of the pregnancy, the defendant changed her mind and decided to keep the child herself. The defendant moved out of state and gave birth to the baby, which she refuses to turn over to the woman. The defendant is guilty of\nA. no crime.\nB. embezzlement.\nC. kidnapping.\nD. false pretenses.\nAnswer: " # noqa
},
{
'role': 'BOT',
'prompt': 'A\n'
},
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'ice',
'pos': 'begin'
},
{
'role':
'HUMAN',
'prompt':
"A rescuer was driving on an isolated portion of a country road. His headlights caught a figure lying at the side of the road. The rescuer stopped to investigate and found a victim, who was bleeding from head wounds and appeared to have been severely beaten. The rescuer then lifted the victim into his car and drove her to the hospital, a half-hour trip. When they arrived at the hospital, the rescuer carried the victim into the emergency room. He left her with a nurse and then returned home. Although the victim recovered from her injuries, she sued the hospital for malpractice, claiming that she was not promptly given medical attention. At trial, the nurse proposes to testify that when the victim was first brought to the hospital, she was unconscious. The victim's attorney objects and moves to strike the nurse's testimony. The trial judge should\nA. sustain the objection, because it goes to an ultimate issue in the case. \nB. sustain the objection, because the nurse is not qualified to render an expert opinion. \nC. overrule the objection, because it is a shorthand rendition of what she observed. \nD. overrule the objection, because there are independent grounds to show a present sense impression. \nAnswer: " # noqa
},
{
'role': 'BOT',
'prompt': 'C\n'
},
{
'section': 'ice',
'pos': 'end'
},
{
'section': 'ice',
'pos': 'begin'
},
{
'role':
'HUMAN',
'prompt':
"A young woman who attended a rock concert at a nightclub was injured when the band opened its performance with illegal fireworks that ignited foam insulation in the club's ceiling and walls. The young woman sued the radio station that sponsored the performance. The radio station has moved for summary judgment, claiming that it owed no duty to audience members. The evidence has established the following facts: The station advertised its sponsorship on the radio and in print, distributed free tickets to the concert, and in print, distributed free tickets to the concert, staffed the event with the station's interns to assist with crowd control, and provided a station disc jockey to serve as master of ceremonies. The master of ceremonies had the authority to stop or delay the performance at any time on the basis of any safety concern. The station knew or should have known that the band routinely used unlicensed, illegal fireworks in its performances. Should the court grant the radio station's motion for summary judgment? \nA. No, because there is sufficient evidence of knowledge and control on the part of the station to impose on it a duty of care to audience members.\nB. No, because under respondeat superior, the radio station is vicariously liable for the negligent actions of the band.\nC. Yes, because it is the band and the nightclub owners who owed audience members a duty of care.\nD. Yes, because the conduct of the band in setting off illegal fireworks was criminal and setting off illegal fireworks was criminal and was a superseding cause as a matter of law.\nAnswer: " # noqa
},
{
'role': 'BOT',
'prompt': 'A\n'
},
{
'section': 'ice',
'pos': 'end'
},
'\n',
'',
{
'section': 'begin',
'pos': 'end'
},
{
'section': 'round',
'pos': 'begin'
},
{
'role':
'HUMAN',
'prompt':
'A state statute provides: "Whenever a person knows or should know that he (or she) is being arrested by a police officer, it is the duty of such person to refrain from using force or any weapon in resisting arrest. " Violation of the statute is made punishable by fine and/or imprisonment. One morning, there was a bank robbery in the state. That afternoon, a police officer arrested a suspect who he believed was involved in the crime. However, the police officer and the suspect have given different accounts concerning what happened next. According to the police officer, after the suspect was apprehended, he resisted arrest and hit the police officer in the mouth with his fist. The police officer, who was momentarily stunned, pulled out his nightstick and struck the suspect over the head with it. On the other hand, the suspect claimed that after he was arrested, he cursed at the policeman, whereupon the police officer began hitting the suspect with his nightstick. To avoid being hit again, the suspect hit the police officer with his fist, knocking him down. The suspect was charged with assault. The suspect should be found\nA. not guilty, if the arrest was unlawful without probable cause and the jury believes the suspect\'s account.\nB. not guilty, if the arrest was lawful, provided that the jury believes the suspect\'s account.\nC. guilty, if the arrest was lawful, regardless which account the jury believes.\nD. guilty, if the arrest was unlawful, regardless which account the jury believes.\nAnswer: ' # noqa
},
{
'section': 'round',
'pos': 'end'
}
]),
'Hello! How are you?'
]
meta_templates = [
None,
dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True)
], ),
dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True)
],
reserved_roles=[
dict(role='SYSTEM', api_role='SYSTEM'),
],
)
]
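# The three entries above cover: (1) no meta template, (2) a plain HUMAN/BOT
# round template, and (3) a round template with a reserved SYSTEM role; these
# are the same configurations exercised by the APITemplateParser unit tests.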
def test_model(model_cfg: ConfigDict):
for meta_template in meta_templates:
print('Testing meta_template: ', meta_template)
model_cfg['meta_template'] = meta_template
model = build_model_from_cfg(model_cfg)
print('Prompt 0 length:',
model.get_token_len_from_template(test_prompts[0]))
print('Prompt 1 length:',
model.get_token_len_from_template(test_prompts[1]))
print('Prompt lengths: ',
model.get_token_len_from_template(test_prompts))
msgs = model.generate_from_template(test_prompts, max_out_len=100)
print('Prompt 0 response:', msgs[0])
print('Prompt 1 response:', msgs[1])
print('-' * 100)
def parse_args():
parser = argparse.ArgumentParser(
description='Test if a given API model wrapper works properly')
    parser.add_argument('config', help='Config file path')
parser.add_argument('-n', '--non-interactive', action='store_true')
args = parser.parse_args()
return args
def parse_model_cfg(model_cfg: ConfigDict) -> Dict[str, ConfigDict]:
model2cfg = {}
for model in model_cfg:
model2cfg[model_abbr_from_cfg(model)] = model
return model2cfg
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if 'models' not in cfg:
raise ValueError('No "models" specified in config file!')
model2cfg = parse_model_cfg(cfg.models)
if not args.non_interactive and len(model2cfg) > 1:
model = Menu([list(model2cfg.keys())],
['Please make a selection of models:']).run()
else:
model = list(model2cfg.keys())[0]
model_cfg = model2cfg[model]
test_model(model_cfg)
if __name__ == '__main__':
main()
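# A minimal usage sketch for this API-model smoke test; the script and config
# names are illustrative assumptions:
#
#   python test_api_model.py configs/eval_api_demo.py
#   python test_api_model.py configs/eval_api_demo.py -n   # skip the model menu
#
# The selected model is run over the two test prompts under every
# meta_template above, printing token lengths and generated responses.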