Commit d7117b95 authored by zhouxiang

Sync code with v0.2.6

parent 5f83e392
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Optional
from typing import List, Literal, Optional, Union
from .archs import autoget_backend_config, get_task
from .messages import PytorchEngineConfig, TurbomindEngineConfig
from .model import ChatTemplateConfig
def pipeline(model_path: str,
model_name: Optional[str] = None,
instance_num: int = 32,
tp: int = 1,
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
log_level='ERROR',
**kwargs):
"""
......@@ -21,38 +26,83 @@ def pipeline(model_path: str,
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
on huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
instance_num (int): the number of instances to be created
tp (int): tensor parallel
backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
config instance. Default to None.
chat_template_config (ChatTemplateConfig): chat template configuration.
Default to None.
log_level (str): set the log level; the value should be one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
Examples:
>>> # LLM
>>> import lmdeploy
>>> pipe = lmdeploy.pipeline('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
>>> response = pipe(['hi','say this is a test'])
>>> print(response)
>>>
>>> # VLM
>>> from lmdeploy.vl import load_image
>>> from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
>>> pipe = pipeline('liuhaotian/llava-v1.5-7b',
... backend_config=TurbomindEngineConfig(session_len=8192),
... chat_template_config=ChatTemplateConfig(model_name='vicuna'))
>>> im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
>>> response = pipe([('describe this image', [im])])
>>> print(response)
""" # noqa E501
from lmdeploy.serve.async_engine import AsyncEngine
os.environ['TM_LOG_LEVEL'] = log_level
return AsyncEngine(model_path,
model_name=model_name,
instance_num=instance_num,
tp=tp,
**kwargs)
if os.getenv('TM_LOG_LEVEL') is None:
os.environ['TM_LOG_LEVEL'] = log_level
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
logger.setLevel(log_level)
pipeline_type, pipeline_class = get_task(model_path)
if pipeline_type == 'vlm':
assert (type(backend_config) is TurbomindEngineConfig) or \
(backend_config is None), \
f'{pipeline_type} model only support turbomind backend.'
if pipeline_type == 'llm' and type(
backend_config) is not PytorchEngineConfig:
# set auto backend mode
backend_config = autoget_backend_config(model_path, backend_config)
backend = 'pytorch' if type(
backend_config) is PytorchEngineConfig else 'turbomind'
logger.info(f'Using {backend} engine')
if 'tp' in kwargs:
logger.warning(
'The argument "tp" is deprecated and will be removed soon. '
'Please set "tp" in "backend_config"')
tp = kwargs['tp']
kwargs.pop('tp')
else:
tp = 1 if backend_config is None else backend_config.tp
return pipeline_class(model_path,
model_name=model_name,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
tp=tp,
**kwargs)
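# Usage sketch for the `pipeline` API above (the model id and config values
# are illustrative, not prescriptive): the deprecated `tp` keyword is replaced
# by setting `tp` on the engine config, matching the deprecation warning
# emitted in the function body.
def _pipeline_usage_example():
    """Illustrative sketch; never invoked by the library itself."""
    engine_cfg = TurbomindEngineConfig(tp=2, session_len=4096)
    pipe = pipeline('internlm/internlm-chat-7b', backend_config=engine_cfg)
    return pipe(['hi', 'say this is a test'])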
def serve(model_path: str,
model_name: Optional[str] = None,
backend: Literal['turbomind', 'pytorch'] = 'turbomind',
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
tp: int = 1,
log_level: str = 'ERROR',
api_keys: Optional[Union[List[str], str]] = None,
ssl: bool = False,
**kwargs):
"""This will run the api_server in a subprocess.
......@@ -67,24 +117,31 @@ def serve(model_path: str,
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
on huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
backend (str): either `turbomind` or `pytorch` backend. Default to
`turbomind` backend.
backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
config instance. Default to none.
chat_template_config (ChatTemplateConfig): chat template configuration.
Default to None.
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
tp (int): tensor parallel
log_level (str): set the log level; the value should be one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
api_keys (List[str] | str | None): Optional list of API keys. A string is
accepted as a single api key. Default to None, which means no api key is applied.
ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'.
Return:
APIClient: A client chatbot for LLaMA series models.
Examples:
>>> import lmdeploy
>>> client = lmdeploy.serve('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> client = lmdeploy.serve('internlm/internlm-chat-7b', 'internlm-chat-7b')
>>> for output in client.chat('hi', 1):
... print(output)
""" # noqa E501
......@@ -93,33 +150,57 @@ def serve(model_path: str,
from lmdeploy.serve.openai.api_client import APIClient
from lmdeploy.serve.openai.api_server import serve
if type(backend_config) is not PytorchEngineConfig:
# set auto backend mode
backend_config = autoget_backend_config(model_path, backend_config)
backend = 'pytorch' if type(
backend_config) is PytorchEngineConfig else 'turbomind'
if 'tp' in kwargs:
tp = kwargs['tp']
kwargs.pop('tp')
else:
tp = 1 if backend_config is None else backend_config.tp
task = Process(target=serve,
args=(model_path, ),
kwargs=dict(model_name=model_name,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
tp=tp,
log_level=log_level,
**kwargs))
api_keys=api_keys,
ssl=ssl,
**kwargs),
daemon=True)
task.start()
client = APIClient(f'http://{server_name}:{server_port}')
while True:
time.sleep(1)
try:
client.available_models
print(
f'Launched the api_server in process {task.pid}, user can '
f'kill the server by:\nimport os,signal\nos.kill({task.pid}, '
'signal.SIGKILL)')
return client
except: # noqa
pass
def client(api_server_url: str = 'http://0.0.0.0:23333', **kwargs):
def client(api_server_url: str = 'http://0.0.0.0:23333',
api_key: Optional[str] = None,
**kwargs):
"""
Args:
api_server_url (str): communicating address 'http://<ip>:<port>' of
api_server
api_key (str | None): api key. Default to None, which means no
api key will be used.
Return:
Chatbot for LLaMA series models with turbomind as inference engine.
"""
from lmdeploy.serve.openai.api_client import APIClient
return APIClient(api_server_url, **kwargs)
return APIClient(api_server_url, api_key, **kwargs)
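# Usage sketch tying `serve` and `client` together (model id, port and api
# key are placeholders): the api_server runs in a daemon subprocess, `serve`
# returns an APIClient once the server answers, and an equivalent client can
# later be built from any other process with `client(...)`.
def _serve_and_client_example():
    """Illustrative sketch; never invoked by the library itself."""
    api_client = serve('internlm/internlm-chat-7b',
                       server_name='0.0.0.0',
                       server_port=23333,
                       api_keys='sk-demo-key')
    same_client = client('http://0.0.0.0:23333', api_key='sk-demo-key')
    return api_client, same_client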
# Copyright (c) OpenMMLab. All rights reserved.
from .cli import run
from .entrypoint import run
__all__ = ['run']
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional
from .cli import CLI
from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter,
convert_args, get_lora_adapters)
class SubCliChat(object):
"""Chat through terminal with pytorch or turbomind model."""
_help = 'Chat with pytorch or turbomind engine.'
_desc = _help
parser = CLI.subparsers.add_parser('chat', help=_help, description=_desc)
subparsers = parser.add_subparsers(
title='Commands', description='This group has the following commands:')
def torch(self,
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None):
"""Chat with pytorch model through terminal.
@staticmethod
def add_parser_torch():
"""Add parser for torch command."""
parser = SubCliChat.subparsers.add_parser(
'torch',
formatter_class=DefaultsAndTypesHelpFormatter,
help=SubCliChat.torch.__doc__,
description=SubCliChat.torch.__doc__,
)
parser.set_defaults(run=SubCliChat.torch)
parser.add_argument('model_path',
type=str,
help='The huggingface model path')
# engine args
engine_group = parser.add_argument_group('Engine arguments')
ArgumentHelper.model_name(engine_group)
ArgumentHelper.tp(engine_group)
ArgumentHelper.session_len(engine_group)
ArgumentHelper.adapters(engine_group)
ArgumentHelper.cache_max_entry_count(engine_group)
Args:
model_path (str): Path to pytorch model.
tokenizer_path (str): Path to tokenizer.
accel (str): Model accelerator.
max_new_tokens (int): Maximum number of tokens to generate.
temperature (float): Temperature for sampling.
top_p (float): Top p for sampling.
seed (int): Random seed.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
This argument is passed directly to transformers'
``AutoTokenizer.from_pretrained``.
Generally, users should choose fast tokenizers.
But if the fast tokenizer raises errors, try forcing a slow one.
max_alloc (int): Maximum memory to allocate (for deepspeed).
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
This includes both the history and the current session.
log_file (str): Path to log file.
debug (bool): Whether to enable debug mode.
adapter (str): Force the use of a specific adapter.
Generally, users should not set this argument, because the adapter is
selected automatically based on the model type. It is only required when
automatic selection is impossible, e.g. when llama 1/2 cannot be
distinguished from the `LlamaForCausalLM` class.
Currently, only "llama1" is accepted for llama1 models.
""" # noqa: E501
from lmdeploy.pytorch.chat import main as run_torch_model
# other args
parser.add_argument('--trust-remote-code',
action='store_false',
default=True,
help='Trust remote code')
run_torch_model(model_path,
tokenizer_path=tokenizer_path,
accel=accel,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
seed=seed,
use_fast_tokenizer=use_fast_tokenizer,
max_alloc=max_alloc,
max_session_len=max_session_len,
log_file=log_file,
debug=debug,
adapter=adapter)
@staticmethod
def add_parser_turbomind():
"""Add parser for turbomind command."""
parser = SubCliChat.subparsers.add_parser(
'turbomind',
formatter_class=DefaultsAndTypesHelpFormatter,
help=SubCliChat.turbomind.__doc__,
description=SubCliChat.turbomind.__doc__,
)
parser.set_defaults(run=SubCliChat.turbomind)
parser.add_argument(
'model_path',
type=str,
help='The path of the deployed model. '
'It can be in huggingface or turbomind format. '
'When it is a turbomind model, all engine config '
'arguments will be ignored, so you need to modify the `config.ini`')
# engine arguments
engine_group = parser.add_argument_group('Engine arguments')
ArgumentHelper.tp(engine_group)
ArgumentHelper.model_format(engine_group)
ArgumentHelper.quant_policy(engine_group)
ArgumentHelper.model_name(engine_group)
ArgumentHelper.cache_max_entry_count(engine_group)
ArgumentHelper.rope_scaling_factor(engine_group)
ArgumentHelper.session_len(engine_group)
# other arguments
ArgumentHelper.cap(parser)
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
def turbomind(self,
model_path,
session_id: int = 1,
cap: str = 'chat',
tp=1,
stream_output=True,
**kwargs):
"""Chat with turbomind model through terminal.
@staticmethod
def torch(args):
"""Chat with PyTorch inference engine through terminal."""
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.chat import run_chat
adapters = get_lora_adapters(args.adapters)
engine_config = PytorchEngineConfig(
model_name=args.model_name,
tp=args.tp,
session_len=args.session_len,
cache_max_entry_count=args.cache_max_entry_count,
adapters=adapters)
run_chat(args.model_path,
engine_config,
trust_remote_code=args.trust_remote_code)
Args:
model_path (str): the path of the deployed model
session_id (int): the unique id of a session
cap (str): the capability of a model. For example, codellama supports
the capabilities ['completion', 'infilling', 'chat', 'python']
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing model's chat
template
"""
from lmdeploy.turbomind.chat import main as run_turbomind_model
@staticmethod
def turbomind(args):
"""Chat with TurboMind inference engine through terminal."""
from lmdeploy.turbomind.chat import main
kwargs = convert_args(args)
from lmdeploy.model import ChatTemplateConfig
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
kwargs.update(dict(chat_template_cfg=chat_template_config))
kwargs.pop('chat_template', None)
main(**kwargs)
run_turbomind_model(model_path,
session_id=session_id,
cap=cap,
tp=tp,
stream_output=stream_output,
**kwargs)
@staticmethod
def add_parsers():
"""Add all parsers."""
SubCliChat.add_parser_torch()
SubCliChat.add_parser_turbomind()
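# Every handler above converts the parsed argparse Namespace into keyword
# arguments with `convert_args(args)` before calling the underlying API. Its
# implementation lives in `lmdeploy.cli.utils` and is not shown in this diff;
# the helper below is only a hypothetical reading of that pattern (drop the
# CLI bookkeeping entries, forward the rest), not the actual code.
def _convert_args_sketch(args):
    """Hypothetical stand-in for convert_args: Namespace -> kwargs dict."""
    kwargs = dict(vars(args))
    for cli_only_key in ('run', 'command'):  # assumed bookkeeping entries
        kwargs.pop(cli_only_key, None)
    return kwargs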
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import fire
from .chat import SubCliChat
from .lite import SubCliLite
from .serve import SubCliServe
from ..version import __version__
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args
class CLI(object):
"""LMDeploy Command Line Interface.
The CLI provides a unified API for converting, compressing and deploying
large language models.
"""
def convert(self,
model_name: str,
model_path: str,
model_format: str = None,
tokenizer_path: str = None,
dst_path: str = './workspace',
tp: int = 1,
quant_path: str = None,
group_size: int = 0,
**kwargs):
"""Convert LLMs to lmdeploy format.
Args:
model_name (str): The name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): The directory path of the model or huggingface
repo_id like 'internlm/internlm-chat-20b'
model_format (str): the format of the model, should choose from
['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means huggingface llama format, and 'awq' means
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, which means the model_format will be
inferred based on model_name
tokenizer_path (str): The path of tokenizer model.
dst_path (str): The destination path that saves outputs.
tp (int): The number of GPUs used for tensor parallelism, which
should be 2^n.
quant_path (str): Path of the quantized model, which can be None.
group_size (int): A parameter used in AWQ to quantize fp16 weights
to 4 bits.
kwargs (dict): other params for convert
"""
from lmdeploy.turbomind.deploy.converter import main as convert
convert(model_name,
model_path,
model_format=model_format,
tokenizer_path=tokenizer_path,
dst_path=dst_path,
tp=tp,
quant_path=quant_path,
group_size=group_size,
**kwargs)
def list(self, engine: str = 'turbomind'):
"""List supported model names.
Example 1:
lmdeploy list
Example 2:
lmdeploy list --engine pytorch
Args:
engine (str): The backend for the model to run. Choice from
['turbomind', 'pytorch'].
"""
assert engine in ['turbomind', 'pytorch']
if engine == 'pytorch':
model_names = ['llama', 'llama2', 'internlm-7b']
elif engine == 'turbomind':
from lmdeploy.model import MODELS
model_names = list(MODELS.module_dict.keys())
model_names = [n for n in model_names if n.lower() not in ['base']]
_desc = 'The CLI provides a unified API for converting, ' \
'compressing and deploying large language models.'
parser = argparse.ArgumentParser(prog='lmdeploy',
description=_desc,
add_help=True)
parser.add_argument('-v',
'--version',
action='version',
version=__version__)
subparsers = parser.add_subparsers(
title='Commands',
description='lmdeploy has following commands:',
dest='command')
@staticmethod
def add_parser_convert():
"""Add parser for convert command."""
parser = CLI.subparsers.add_parser(
'convert',
formatter_class=DefaultsAndTypesHelpFormatter,
description=CLI.convert.__doc__,
help=CLI.convert.__doc__)
# define arguments
parser.add_argument(
'model_name',
type=str,
help='The name of the to-be-deployed model, such as llama-7b, '
'llama-13b, vicuna-7b, etc. You can run `lmdeploy list` to '
'get the supported model names')
parser.add_argument('model_path',
type=str,
help='The directory path of the model')
ArgumentHelper.model_format(parser)
ArgumentHelper.tp(parser)
# other args
parser.add_argument('--tokenizer-path',
type=str,
default=None,
help='The path of tokenizer model')
parser.add_argument('--dst-path',
type=str,
default='workspace',
help='The destination path that saves outputs')
parser.add_argument(
'--quant-path',
type=str,
default=None,
help='Path of the quantized model, which can be None')
parser.add_argument(
'--group-size',
type=int,
default=0,
help='A parameter used in awq to quantize fp16 weights '
'to 4 bits')
parser.set_defaults(run=CLI.convert)
@staticmethod
def add_parser_list():
"""Add parser for list command."""
parser = CLI.subparsers.add_parser(
'list',
formatter_class=DefaultsAndTypesHelpFormatter,
description=CLI.list.__doc__,
help=CLI.list.__doc__)
parser.set_defaults(run=CLI.list)
# define arguments
ArgumentHelper.engine(parser)
@staticmethod
def add_parser_checkenv():
"""Add parser for check_env command."""
parser = CLI.subparsers.add_parser(
'check_env',
formatter_class=DefaultsAndTypesHelpFormatter,
description=CLI.check_env.__doc__,
help=CLI.check_env.__doc__)
parser.set_defaults(run=CLI.check_env)
parser.add_argument('--dump-file',
type=str,
default=None,
help='The file path to save env info. Only '
'the `json`, `yml` and `pkl` formats '
'are supported')
@staticmethod
def convert(args):
"""Convert LLMs to turbomind format."""
from lmdeploy.turbomind.deploy.converter import main
kwargs = convert_args(args)
main(**kwargs)
@staticmethod
def list(args):
"""List the supported model names."""
from lmdeploy.model import MODELS
model_names = list(MODELS.module_dict.keys())
deprecate_names = [
'baichuan-7b', 'baichuan2-7b', 'chatglm2-6b', 'internlm-chat-20b',
'internlm-chat-7b', 'internlm-chat-7b-8k', 'internlm2-1_8b',
'internlm-20b', 'internlm2-20b', 'internlm2-7b', 'internlm2-chat',
'internlm2-chat-1_8b', 'internlm2-chat-20b', 'internlm2-chat-7b',
'llama-2-chat', 'llama-2', 'qwen-14b', 'qwen-7b', 'solar-70b',
'yi-200k', 'yi-34b', 'yi-chat', 'Mistral-7B-Instruct',
'Mixtral-8x7B-Instruct', 'baichuan-base', 'deepseek-chat',
'internlm-chat'
]
model_names = [
n for n in model_names if n not in deprecate_names + ['base']
]
deprecate_names.sort()
model_names.sort()
print('Supported model names:')
print('The older chat template names like "internlm2-7b", "qwen-7b"'
' and so on are deprecated and will be removed in the future.'
' The supported chat template names are:')
print('\n'.join(model_names))
def check_env(self, dump_file: str = None):
"""Check env information.
Args:
dump_file (str): Output file to save env info.
"""
@staticmethod
def check_env(args):
"""Check the environmental information."""
import importlib
import mmengine
......@@ -121,19 +158,16 @@ class CLI(object):
print(f'{k}: {v}')
# dump to local file
dump_file = args.dump_file
if dump_file is not None:
work_dir, _ = os.path.split(dump_file)
if work_dir:
os.makedirs(work_dir, exist_ok=True)
mmengine.dump(env_info, dump_file)
def run():
"""The entry point of running LMDeploy CLI."""
cli = CLI()
cli.lite = SubCliLite()
cli.chat = SubCliChat()
cli.serve = SubCliServe()
fire.Fire(cli, name='lmdeploy')
@staticmethod
def add_parsers():
"""Add all parsers."""
CLI.add_parser_convert()
CLI.add_parser_list()
CLI.add_parser_checkenv()
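# The sub-commands above register their handlers via
# `parser.set_defaults(run=...)`, so the CLI entrypoint only needs to parse
# the arguments and invoke the stored handler. The sketch below illustrates
# that dispatch pattern; it is a hedged example, not the actual
# `lmdeploy.cli.entrypoint` implementation, and registers only CLI's own
# sub-parsers.
def _dispatch_example(argv=None):
    """Illustrative sketch of the set_defaults(run=...) dispatch pattern."""
    CLI.add_parsers()
    args = CLI.parser.parse_args(argv)
    if hasattr(args, 'run'):
        return args.run(args)
    return CLI.parser.print_help()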
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.config import DictAction
from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args
class SubCliLite(object):
"""CLI for compressing LLMs."""
_help = 'Compressing and accelerating LLMs with lmdeploy.lite module'
_desc = _help
parser = CLI.subparsers.add_parser(
'lite',
help=_help,
description=_desc,
)
subparsers = parser.add_subparsers(
title='Commands', description='This group has the following commands:')
def auto_awq(self,
model: str,
work_dir: str,
w_bits: int = 4,
w_sym: bool = False,
w_group_size: int = 128,
device: str = 'cuda'):
"""Perform weight quantization using AWQ algorithm.
@staticmethod
def add_parser_auto_awq():
"""Add parser for auto_awq command."""
parser = SubCliLite.subparsers.add_parser(
'auto_awq',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.auto_awq.__doc__,
help=SubCliLite.auto_awq.__doc__)
parser.set_defaults(run=SubCliLite.auto_awq)
parser.add_argument('model',
type=str,
help='The path of model in hf format')
ArgumentHelper.work_dir(parser)
ArgumentHelper.calib_dataset(parser)
ArgumentHelper.calib_samples(parser)
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.device(parser)
parser.add_argument('--w-bits',
type=int,
default=4,
help='Bit number for weight quantization')
parser.add_argument('--w-sym',
action='store_true',
help='Whether to do symmetric quantization')
parser.add_argument(
'--w-group-size',
type=int,
default=128,
help='Group size for weight quantization statistics')
Args:
model (str): The path of model in hf format.
work_dir (str): The working directory to save results.
w_bits (int): Bit number for weight quantization.
w_sym (bool): Whether to do symmetric quantization.
w_group_size (int): Group size for weight quantization statistics.
device (str): Device type of running.
"""
from lmdeploy.lite.apis.auto_awq import auto_awq
@staticmethod
def add_parser_calibrate():
"""Add parser for calibrate command."""
parser = SubCliLite.subparsers.add_parser(
'calibrate',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.calibrate.__doc__,
help=SubCliLite.calibrate.__doc__)
parser.set_defaults(run=SubCliLite.calibrate)
parser.add_argument('model',
type=str,
help='The name or path of the model to be loaded')
ArgumentHelper.work_dir(parser)
ArgumentHelper.calib_dataset(parser)
ArgumentHelper.calib_samples(parser)
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.device(parser)
auto_awq(model,
work_dir,
w_bits=w_bits,
w_sym=w_sym,
w_group_size=w_group_size,
device=device)
@staticmethod
def add_parser_smooth_quant():
"""Add parser for smooth_quant command."""
parser = SubCliLite.subparsers.add_parser(
'smooth_quant',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.smooth_quant.__doc__,
help=SubCliLite.smooth_quant.__doc__)
parser.set_defaults(run=SubCliLite.smooth_quant)
parser.add_argument('model',
type=str,
help='The name or path of the model to be loaded')
parser.add_argument(
'--work-dir',
type=str,
default='./work_dir',
help='The working directory for outputs. defaults to "./work_dir"')
ArgumentHelper.calib_dataset(parser)
ArgumentHelper.calib_samples(parser)
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.device(parser)
def calibrate(self,
model: str,
calib_dataset: str = 'c4',
calib_samples: int = 128,
calib_seqlen: int = 2048,
work_dir: str = './work_dir',
device: str = 'cuda') -> None:
"""Perform calibration on a given dataset.
@staticmethod
def add_parser_kv_qparams():
"""Add parser for kv_qparams command."""
parser = SubCliLite.subparsers.add_parser(
'kv_qparams',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.kv_qparams.__doc__,
help=SubCliLite.kv_qparams.__doc__)
parser.set_defaults(run=SubCliLite.kv_qparams)
Args:
model (str): The model to be loaded.
calib_dataset (str, optional): The calibration dataset name.
Defaults to 'c4'.
calib_samples (int, optional): The number of samples for
calibration. Defaults to 128.
calib_seqlen (int, optional): The sequence length for calibration.
Defaults to 2048.
work_dir (str): The working directory for outputs.
Defaults to './work_dir'.
device (str, optional): The device to be used for calculation.
Defaults to 'cuda'.
"""
from lmdeploy.lite.apis.calibrate import calibrate
parser.add_argument('work_dir',
type=str,
help='Directory path where the stats are saved')
parser.add_argument('turbomind_dir',
type=str,
help='Directory path where to save the results')
parser.add_argument('--kv-bits',
type=int,
default=8,
help='Number of bits for quantization')
parser.add_argument('--kv-sym',
action='store_true',
help='Whether to use symmetric quantization')
parser.add_argument(
'--num-tp',
type=int,
default=None,
help='GPU number used in tensor parallelism. Should be 2^n')
parser.add_argument('--tm-params',
nargs='*',
default=None,
action=DictAction,
help='Key-value pairs in xxx=yyy format used'
' to update the turbomind model weights'
' config')
calibrate(model,
calib_dataset=calib_dataset,
calib_samples=calib_samples,
calib_seqlen=calib_seqlen,
work_dir=work_dir,
device=device)
@staticmethod
def auto_awq(args):
"""Perform weight quantization using AWQ algorithm."""
from lmdeploy.lite.apis.auto_awq import auto_awq
kwargs = convert_args(args)
auto_awq(**kwargs)
def kv_qparams(self,
work_dir: str,
turbomind_dir: str,
kv_bits: int = 8,
kv_sym: bool = False,
num_tp: int = 1) -> None:
"""Export key and value stats.
@staticmethod
def calibrate(args):
"""Perform calibration on a given dataset."""
from lmdeploy.lite.apis.calibrate import calibrate
kwargs = convert_args(args)
calibrate(**kwargs)
Args:
work_dir (str): Directory path where the stats
are saved.
turbomind_dir (str): Directory path where to
save the results.
kv_bits (int, optional): Number of bits for quantization.
Defaults to 8.
kv_sym (bool, optional): Whether to use symmetric quantization.
Defaults to False.
num_tp (int, optional): Number of tensor parallelism.
Defaults to 1.
"""
@staticmethod
def kv_qparams(args):
"""Export key and value stats."""
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
kwargs = convert_args(args)
run_kv_qparams(**kwargs)
run_kv_qparams(work_dir,
turbomind_dir,
kv_bits=kv_bits,
kv_sym=kv_sym,
num_tp=num_tp)
def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
"""Convert a hugging face model to the smallest sharded one.
@staticmethod
def smooth_quant(args):
"""Perform w8a8 quantization using SmoothQuant."""
from lmdeploy.lite.apis.smooth_quant import smooth_quant
kwargs = convert_args(args)
smooth_quant(**kwargs)
Args:
src_dir (str): The directory of the input HF model.
dst_dir (str): The directory to save new model.
"""
from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded
run_sharded(src_dir, dst_dir)
@staticmethod
def add_parsers():
"""Add all parsers."""
SubCliLite.add_parser_auto_awq()
SubCliLite.add_parser_calibrate()
SubCliLite.add_parser_kv_qparams()
SubCliLite.add_parser_smooth_quant()
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional
from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args
class SubCliServe(object):
class SubCliServe:
"""Serve LLMs and interact on terminal or web UI."""
_help = 'Serve LLMs with gradio, openai API or triton server.'
_desc = _help
parser = CLI.subparsers.add_parser(
'serve',
help=_help,
description=_desc,
)
subparsers = parser.add_subparsers(
title='Commands', description='This group has the following commands:')
def gradio(self,
model_path_or_server: str,
server_name: str = '0.0.0.0',
server_port: int = 6006,
batch_size: int = 32,
tp: int = 1,
**kwargs):
"""Serve LLMs with web ui using gradio.
Example 1:
lmdeploy serve gradio ./workspace
Example 2:
lmdeploy serve gradio http://0.0.0.0:23333
--server_name 0.0.0.0
--server_port 6006
Example 3:
lmdeploy serve gradio ${triton_server_ip_addresss}:33337
Args:
model_path_or_server (str): the path of the deployed model or the
tritonserver URL or restful api URL. The former serves the model
directly with gradio, while the latter makes gradio a front end to an
already running tritonserver or restful api server.
server_name (str): the ip address of gradio server
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): tensor parallel for Turbomind
kwargs (dict): extra params to init
"""
@staticmethod
def add_parser_gradio():
"""Add parser for gradio command."""
parser = SubCliServe.subparsers.add_parser(
'gradio',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.gradio.__doc__,
help=SubCliServe.gradio.__doc__)
parser.set_defaults(run=SubCliServe.gradio)
parser.add_argument(
'model_path_or_server',
type=str,
help='The path of the deployed model or the tritonserver url or '
'restful api url. For example: - ./workspace - 0.0.0.0:23333'
' - http://0.0.0.0:23333')
parser.add_argument('--server-name',
type=str,
default='0.0.0.0',
help='The ip address of gradio server')
parser.add_argument('--server-port',
type=int,
default=6006,
help='The port of gradio server')
# common args
ArgumentHelper.backend(parser)
# chat template args
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
ArgumentHelper.cap(parser)
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
# common engine args
tp_act = ArgumentHelper.tp(pt_group)
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
ArgumentHelper.model_format(tb_group)
ArgumentHelper.quant_policy(tb_group)
ArgumentHelper.rope_scaling_factor(tb_group)
@staticmethod
def add_parser_api_server():
"""Add parser for api_server command."""
parser = SubCliServe.subparsers.add_parser(
'api_server',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.api_server.__doc__,
help=SubCliServe.api_server.__doc__)
parser.set_defaults(run=SubCliServe.api_server)
parser.add_argument(
'model_path',
type=str,
help='The path of a model. It could be one of the following '
'options: - i) a local directory path of a turbomind model'
' which is converted by `lmdeploy convert` command or '
'download from ii) and iii). - ii) the model_id of a '
'lmdeploy-quantized model hosted inside a model repo on '
'huggingface.co, such as "internlm/internlm-chat-20b-4bit",'
' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
' of a model hosted inside a model repo on huggingface.co,'
' such as "internlm/internlm-chat-7b", "qwen/qwen-7b-chat "'
', "baichuan-inc/baichuan2-7b-chat" and so on')
parser.add_argument('--server-name',
type=str,
default='0.0.0.0',
help='Host ip for serving')
parser.add_argument('--server-port',
type=int,
default=23333,
help='Server port')
parser.add_argument('--allow-origins',
nargs='+',
type=str,
default=['*'],
help='A list of allowed origins for cors')
parser.add_argument('--allow-credentials',
action='store_true',
help='Whether to allow credentials for cors')
parser.add_argument('--allow-methods',
nargs='+',
type=str,
default=['*'],
help='A list of allowed http methods for cors')
parser.add_argument('--allow-headers',
nargs='+',
type=str,
default=['*'],
help='A list of allowed http headers for cors')
parser.add_argument('--qos-config-path',
type=str,
default='',
help='Qos policy config path')
# common args
ArgumentHelper.backend(parser)
ArgumentHelper.log_level(parser)
ArgumentHelper.api_keys(parser)
ArgumentHelper.ssl(parser)
# chat template args
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
ArgumentHelper.cap(parser)
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
# common engine args
tp_act = ArgumentHelper.tp(pt_group)
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
ArgumentHelper.model_format(tb_group)
ArgumentHelper.quant_policy(tb_group)
ArgumentHelper.rope_scaling_factor(tb_group)
@staticmethod
def add_parser_api_client():
"""Add parser for api_client command."""
parser = SubCliServe.subparsers.add_parser(
'api_client',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.api_client.__doc__,
help=SubCliServe.api_client.__doc__)
parser.set_defaults(run=SubCliServe.api_client)
parser.add_argument('api_server_url',
type=str,
help='The URL of api server')
parser.add_argument('--api-key',
type=str,
default=None,
help='api key. Default to None, which means no '
'api key will be used')
ArgumentHelper.session_id(parser)
@staticmethod
def add_parser_triton_client():
"""Add parser for triton_client command."""
parser = SubCliServe.subparsers.add_parser(
'triton_client',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.triton_client.__doc__,
help=SubCliServe.triton_client.__doc__)
parser.set_defaults(run=SubCliServe.triton_client)
parser.add_argument(
'tritonserver_addr',
type=str,
help='The address in format "ip:port" of triton inference server')
ArgumentHelper.session_id(parser)
ArgumentHelper.cap(parser)
ArgumentHelper.stream_output(parser)
@staticmethod
def gradio(args):
"""Serve LLMs with web UI using gradio."""
from lmdeploy.archs import autoget_backend
from lmdeploy.messages import (PytorchEngineConfig,
TurbomindEngineConfig)
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.gradio.app import run
run(model_path_or_server,
server_name=server_name,
server_port=server_port,
batch_size=batch_size,
tp=tp,
**kwargs)
def api_server(self,
model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
tp: int = 1,
allow_origins: List[str] = ['*'],
allow_credentials: bool = True,
allow_methods: List[str] = ['*'],
allow_headers: List[str] = ['*'],
**kwargs):
"""Serve LLMs with restful api using fastapi.
Args:
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by `lmdeploy convert` command or
download from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b"
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
tp (int): tensor parallel
allow_origins (List[str]): a list of allowed origins for CORS
allow_credentials (bool): whether to allow credentials for CORS
allow_methods (List[str]): a list of allowed HTTP methods for CORS
allow_headers (List[str]): a list of allowed HTTP headers for CORS
kwargs (dict): extra params to init api server
"""
backend = args.backend
if backend != 'pytorch' and ':' not in args.model_path_or_server:
# set auto backend mode
backend = autoget_backend(args.model_path_or_server)
if backend == 'pytorch':
backend_config = PytorchEngineConfig(
tp=args.tp,
model_name=args.model_name,
max_batch_size=args.max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
session_len=args.session_len)
else:
backend_config = TurbomindEngineConfig(
model_name=args.model_name,
tp=args.tp,
max_batch_size=args.max_batch_size,
session_len=args.session_len,
model_format=args.model_format,
quant_policy=args.quant_policy,
rope_scaling_factor=args.rope_scaling_factor,
cache_max_entry_count=args.cache_max_entry_count)
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
run(args.model_path_or_server,
server_name=args.server_name,
server_port=args.server_port,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config)
@staticmethod
def api_server(args):
"""Serve LLMs with restful api using fastapi."""
from lmdeploy.archs import autoget_backend
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.openai.api_server import serve as run_api_server
backend = args.backend
if backend != 'pytorch':
# set auto backend mode
backend = autoget_backend(args.model_path)
if backend == 'pytorch':
from lmdeploy.messages import PytorchEngineConfig
backend_config = PytorchEngineConfig(
tp=args.tp,
model_name=args.model_name,
max_batch_size=args.max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
session_len=args.session_len)
else:
from lmdeploy.messages import TurbomindEngineConfig
backend_config = TurbomindEngineConfig(
model_name=args.model_name,
tp=args.tp,
max_batch_size=args.max_batch_size,
session_len=args.session_len,
model_format=args.model_format,
quant_policy=args.quant_policy,
rope_scaling_factor=args.rope_scaling_factor,
cache_max_entry_count=args.cache_max_entry_count)
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
run_api_server(args.model_path,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
server_name=args.server_name,
server_port=args.server_port,
allow_origins=args.allow_origins,
allow_credentials=args.allow_credentials,
allow_methods=args.allow_methods,
allow_headers=args.allow_headers,
log_level=args.log_level.upper(),
api_keys=args.api_keys,
ssl=args.ssl,
qos_config_path=args.qos_config_path)
run_api_server(model_path,
model_name=model_name,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
tp=tp,
allow_origins=allow_origins,
allow_credentials=allow_credentials,
allow_methods=allow_methods,
allow_headers=allow_headers,
**kwargs)
def api_client(self, restful_api_url: str, session_id: int = 0):
"""Interact with restful api server in terminal.
Args:
restful_api_url: The restful api URL.
session_id: The unique id of a session.
"""
@staticmethod
def api_client(args):
"""Interact with restful api server in terminal."""
from lmdeploy.serve.openai.api_client import main as run_api_client
run_api_client(restful_api_url, session_id=session_id)
def triton_client(self,
tritonserver_addr: str,
session_id: int = 1,
cap: str = 'chat',
stream_output: bool = True,
**kwargs):
"""Interact with Triton Server using gRPC protocol.
Args:
tritonserver_addr (str): the address in format "ip:port" of
triton inference server
session_id (int): the unique id of a session
cap (str): the capability of a model. For example, codellama
supports the capabilities ['completion', 'infill', 'instruct',
'python']
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing model's
chat template
"""
kwargs = convert_args(args)
run_api_client(**kwargs)
@staticmethod
def triton_client(args):
"""Interact with Triton Server using gRPC protocol."""
from lmdeploy.serve.client import main as run_triton_client
kwargs = convert_args(args)
run_triton_client(**kwargs)
run_triton_client(
tritonserver_addr,
session_id=session_id,
cap=cap,
stream_output=stream_output,
**kwargs,
)
@staticmethod
def add_parsers():
SubCliServe.add_parser_gradio()
SubCliServe.add_parser_api_server()
SubCliServe.add_parser_api_client()
SubCliServe.add_parser_triton_client()
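# The gradio and api_server handlers above pick the engine automatically:
# unless the user forces `--backend pytorch`, `autoget_backend(model_path)`
# decides between turbomind and pytorch, and the matching engine config is
# built. A condensed sketch of that decision follows; the model path and the
# default tp value are placeholders.
def _backend_selection_example(model_path='internlm/internlm-chat-7b',
                               requested_backend='turbomind'):
    """Illustrative sketch of the auto-backend branch used above."""
    from lmdeploy.archs import autoget_backend
    from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
    backend = requested_backend
    if backend != 'pytorch':
        backend = autoget_backend(model_path)
    if backend == 'pytorch':
        return backend, PytorchEngineConfig(tp=1)
    return backend, TurbomindEngineConfig(tp=1)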
# Copyright (c) OpenMMLab. All rights reserved.
from pathlib import Path
import torch
from torch import nn
from transformers import AutoTokenizer
from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
quant_weights, smooth_layers)
from lmdeploy.lite.utils import collect_target_modules, load_hf_from_pretrained
from lmdeploy.lite.utils import collect_target_modules
from .calibrate import calibrate
# from lmdeploy.lite.utils.export_turbomind import export_turbomind_config
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
......@@ -21,6 +21,7 @@ LAYER_TYPE_MAP = {
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'InternLM2ForCausalLM': 'InternLM2RMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
......@@ -29,30 +30,33 @@ NORM_TYPE_MAP = {
def auto_awq(model: str,
work_dir: str,
work_dir: str = './work_dir',
calib_dataset: str = 'ptb',
calib_samples: int = 128,
calib_seqlen: int = 2048,
w_bits: int = 4,
w_sym: bool = False,
w_group_size: int = 128,
device: str = 'cuda'):
assert model != work_dir, '$WORK_DIR and $HF_MODEL should be different'
model_path = model # noqa
# Load tokenizer and configuration
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
trust_remote_code=True)
model = load_hf_from_pretrained(model,
torch_dtype=torch.float16,
trust_remote_code=True)
"""Perform weight quantization using AWQ algorithm.
Args:
model (str): The path of model in hf format.
work_dir (str): The working directory to save results.
calib_dataset (str): The calibration dataset name.
calib_samples (int): The number of samples for calibration.
calib_seqlen (int): The sequence length for calibration.
w_bits (int): Bit number for weight quantization.
w_sym (bool): Whether to do symmetric quantization.
w_group_size (int): Group size for weight quantization statistics.
device (str): Device type of running.
"""
model, tokenizer, work_dir = calibrate(model, calib_dataset, calib_samples,
calib_seqlen, work_dir, device)
layer_type = LAYER_TYPE_MAP[type(model).__name__]
fc2fcs = FC_FCS_MAP[layer_type]
norm2fcs = NORM_FCS_MAP[layer_type]
work_dir = Path(work_dir)
act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
layers = collect_target_modules(model, layer_type)
fcs = {}
......@@ -68,11 +72,6 @@ def auto_awq(model: str,
safe_serialization=False)
tokenizer.save_pretrained(work_dir)
# export_turbomind_config(model_name,
# model_path,
# work_dir,
# group_size=w_group_size)
if __name__ == '__main__':
import fire
......
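# Usage sketch for the `auto_awq` API above (the model path and work_dir are
# placeholders): calibration statistics are gathered first via `calibrate`,
# then the target linear layers are smoothed and quantized to `w_bits` with
# per-group scales.
def _auto_awq_usage_example():
    """Illustrative sketch; never invoked by the library itself."""
    auto_awq('internlm/internlm-chat-7b',
             work_dir='./internlm-chat-7b-4bit',
             calib_dataset='ptb',
             calib_samples=128,
             calib_seqlen=2048,
             w_bits=4,
             w_group_size=128,
             device='cuda')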
......@@ -13,19 +13,31 @@ from lmdeploy.lite.utils import (collect_target_modules, get_calib_loaders,
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'InternLM2ForCausalLM': 'InternLM2RMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}
HEAD_NAME_MAP = {
'InternLMForCausalLM': 'lm_head',
'InternLM2ForCausalLM': 'output',
'QWenLMHeadModel': 'lm_head',
'BaiChuanForCausalLM': 'lm_head', # Baichuan 7B
'BaichuanForCausalLM': 'lm_head', # Baichuan2 7B
'LlamaForCausalLM': 'lm_head',
}
def _prepare_for_calibrate(model: nn.Module,
layer_type: Union[str, type],
......@@ -99,7 +111,7 @@ def _prepare_for_calibrate(model: nn.Module,
def calibrate(model: str,
calib_dataset: str = 'c4',
calib_dataset: str = 'ptb',
calib_samples: int = 128,
calib_seqlen: int = 2048,
work_dir: str = './work_dir',
......@@ -110,7 +122,7 @@ def calibrate(model: str,
Args:
model (str): The name or path of the model to be loaded.
calib_dataset (str, optional): The calibration dataset name.
Defaults to 'c4'.
Defaults to 'ptb'.
calib_samples (int, optional): The number of samples for calibration.
Defaults to 128.
calib_seqlen (int, optional): The sequence length for calibration.
......@@ -119,6 +131,11 @@ def calibrate(model: str,
Defaults to './work_dir'.
device (str, optional): The device to be used for calculation.
Defaults to 'cuda'.
Returns:
model (nn.Module): The loaded huggingface model.
tokenizer: The loaded huggingface tokenizer.
work_dir (str): The working directory for outputs.
"""
assert calib_dataset in ['c4', 'ptb', 'wikitext2', 'pileval'], \
......@@ -152,7 +169,8 @@ def calibrate(model: str,
layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]
_prepare_for_calibrate(model, layer_type, 'lm_head', device)
_prepare_for_calibrate(model, layer_type,
HEAD_NAME_MAP[type(model).__name__], device)
print('Loading calibrate dataset ...')
calib_loader, _ = get_calib_loaders(calib_dataset,
......@@ -179,6 +197,8 @@ def calibrate(model: str,
work_dir.mkdir(parents=True, exist_ok=True)
calib_ctx.export(work_dir)
return model, tokenizer, work_dir
if __name__ == '__main__':
import fire
......
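# Usage sketch for `calibrate` above (model path and work_dir are
# placeholders): it now returns the loaded model, the tokenizer and the
# work_dir, and exports the activation statistics into `work_dir` (e.g.
# `inputs_stats.pth` with an 'absmax' entry, as consumed by auto_awq).
def _calibrate_usage_example():
    """Illustrative sketch; never invoked by the library itself."""
    model, tokenizer, work_dir = calibrate('internlm/internlm-chat-7b',
                                           calib_dataset='ptb',
                                           calib_samples=128,
                                           calib_seqlen=2048,
                                           work_dir='./work_dir',
                                           device='cuda')
    return model, tokenizer, work_dir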
......@@ -15,6 +15,10 @@ NORM_FCS_MAP = {
['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
},
'InternLM2DecoderLayer': {
'attention_norm': ['attention.wqkv'],
'ffn_norm': ['feed_forward.w1', 'feed_forward.w3']
},
'QWenBlock': {
'ln_1': ['attn.c_attn'],
'ln_2': ['mlp.w1', 'mlp.w2']
......@@ -34,6 +38,9 @@ FC_FCS_MAP = {
'self_attn.v_proj': ['self_attn.o_proj'],
'mlp.up_proj': ['mlp.down_proj']
},
'InternLM2DecoderLayer': {
'feed_forward.w3': ['feed_forward.w2']
},
'QWenBlock': {
'attn.c_attn': ['attn.c_proj'],
'mlp.w1': ['mlp.c_proj']
......@@ -71,6 +78,13 @@ def smooth_ln_fcs(ln: torch.nn.Module,
:return: Scales
"""
device, dtype = fcs[0].weight.device, fcs[0].weight.dtype
# If zeros exist within the weight of the layer norm, it becomes
# unnecessary to perform smooth quantization at the positions where
# these zeros occur.
zero_positions = (ln.weight == 0).nonzero(as_tuple=True)[0]
nonzero_positions = (ln.weight != 0).nonzero(as_tuple=True)[0]
act_scales = act_scales.to(device=device, dtype=dtype)
concat_w = torch.cat([fc.weight for fc in fcs], dim=0)
......@@ -78,7 +92,11 @@ def smooth_ln_fcs(ln: torch.nn.Module,
scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()
scales = scales / (scales[nonzero_positions].max() *
scales[nonzero_positions].min()).sqrt()
scales[zero_positions] = 1
ln.weight.div_(scales)
if hasattr(ln, 'bias'):
......@@ -182,8 +200,8 @@ def check_awq_supported(layer_type):
def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
"""Quantize the weights of the target model's linear layers."""
from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
from lmdeploy.lite.quantization import WeightQuantizer
from lmdeploy.pytorch.modules import WeightOnlyQLinear
for name, fc in fcs.items():
fc.to(device)
quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
......
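# Condensed numeric sketch of the zero-aware scale normalization introduced in
# `smooth_ln_fcs` above: positions where the layer-norm weight is exactly zero
# are excluded from the max/min normalization and their scale is pinned to 1.
# The tensors below are toy values, not real calibration statistics.
def _smooth_scale_sketch():
    """Illustrative sketch of the scale computation with zero masking."""
    import torch
    ln_weight = torch.tensor([1.0, 0.0, 0.5, 2.0])
    act_scales = torch.tensor([4.0, 3.0, 2.0, 1.0])
    w_scales = torch.tensor([1.0, 1.0, 1.0, 1.0])
    alpha = 0.5
    zero_positions = (ln_weight == 0).nonzero(as_tuple=True)[0]
    nonzero_positions = (ln_weight != 0).nonzero(as_tuple=True)[0]
    scales = act_scales.pow(alpha) / w_scales.pow(1 - alpha)
    scales = scales / (scales[nonzero_positions].max() *
                       scales[nonzero_positions].min()).sqrt()
    scales[zero_positions] = 1
    return scales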
......@@ -3,6 +3,8 @@ from functools import partial
from typing import Union
import torch
import transformers
from mmengine import digit_version
from torch import nn
from transformers import PreTrainedTokenizer
......@@ -53,7 +55,6 @@ class CalibrationContext():
self.num_kv_heads = num_kv_heads
self.head_dim = model.config.hidden_size // num_attn_heads
self.model = model
del self.model.lm_head
self.tokenizer = tokenizer
......@@ -163,12 +164,36 @@ class CalibrationContext():
if k_obs and v_obs:
batch_kwargs[i]['use_cache'] = True
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
out = list(out)
key, value = out.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
version = digit_version(transformers.__version__)
use_new_cache = type(mod).__name__ == 'LlamaDecoderLayer'
if version > digit_version('4.36.0') and use_new_cache:
from transformers.cache_utils import DynamicCache
batch_kwargs[i]['past_key_value'] = DynamicCache()
ori_idx = mod.self_attn.layer_idx
mod.self_attn.layer_idx = 0
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
mod.self_attn.layer_idx = ori_idx
out = list(out)
cache = out.pop(-1)
key = cache.key_cache.pop(-1)
value = cache.value_cache.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
else:
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
out = list(out)
key, value = out.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
del key, value
torch.cuda.empty_cache()
......
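# Condensed sketch of the branch added above: newer transformers (> 4.36)
# return K/V through a DynamicCache object instead of a (key, value) tuple,
# so the calibration context reads the last entry of key_cache/value_cache.
# The helper below only illustrates that unwrapping; `out` is assumed to be a
# decoder-layer output whose last element is either a populated
# transformers.cache_utils.DynamicCache or a legacy (key, value) tuple.
def _unwrap_kv_sketch(out, use_new_cache):
    """Illustrative sketch of how key/value tensors are recovered above."""
    out = list(out)
    if use_new_cache:
        cache = out.pop(-1)          # DynamicCache appended by the layer
        key = cache.key_cache.pop(-1)
        value = cache.value_cache.pop(-1)
    else:
        key, value = out.pop(-1)     # legacy (key, value) tuple
    return key, value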
......@@ -3,7 +3,7 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from lmdeploy.pytorch.model import LoadWoInit
from lmdeploy.pytorch.accel import LoadNoInit
def load_hf_from_pretrained(pretrained_model_name_or_path,
......@@ -26,7 +26,7 @@ def load_hf_from_pretrained(pretrained_model_name_or_path,
elif dtype == torch.bfloat16:
hf_config.bf16 = True
with LoadWoInit():
with LoadNoInit():
# Load model
model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, config=hf_config, **kwargs)
......
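# Usage sketch of the renamed context manager above (the model id is a
# placeholder): `LoadNoInit` from `lmdeploy.pytorch.accel` replaces the old
# `LoadWoInit`; loading inside the context presumably skips the costly random
# weight initialization before the checkpoint weights are assigned.
def _load_no_init_example():
    """Illustrative sketch mirroring the `with LoadNoInit():` usage above."""
    from lmdeploy.pytorch.accel import LoadNoInit
    from transformers import AutoModelForCausalLM
    with LoadNoInit():
        model = AutoModelForCausalLM.from_pretrained(
            'internlm/internlm-chat-7b', trust_remote_code=True)
    return model.eval()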
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
from abc import abstractmethod
from typing import List
from typing import List, Literal, Optional
from mmengine import Registry
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
MODELS = Registry('model', locations=['lmdeploy.model'])
@dataclasses.dataclass
class SamplingParam:
top_p: float = 0.8
top_k: float = None
temperature: float = 0.8
repetition_penalty: float = 1.0
class ChatTemplateConfig:
"""Parameters for chat template.
Args:
model_name (str): the name of the deployed model. It determines which chat template will be applied.
Run `lmdeploy list` to show all the chat template names.
system (str | None): begin of the system prompt
meta_instruction (str | None): system prompt
eosys (str | None): end of the system prompt
user (str | None): begin of the user prompt
eoh (str | None): end of the user prompt
assistant (str | None): begin of the assistant prompt
eoa (str | None): end of the assistant prompt
capability: ('completion' | 'infilling' | 'chat' | 'python') = None
""" # noqa: E501
model_name: str
system: Optional[str] = None
meta_instruction: Optional[str] = None
eosys: Optional[str] = None
user: Optional[str] = None
eoh: Optional[str] = None
assistant: Optional[str] = None
eoa: Optional[str] = None
separator: Optional[str] = None
capability: Optional[Literal['completion', 'infilling', 'chat',
'python']] = None
stop_words: Optional[List[str]] = None
@property
def chat_template(self):
attrs = {
key: value
for key, value in dataclasses.asdict(self).items()
if value is not None
}
attrs.pop('model_name', None)
if self.model_name in MODELS.module_dict.keys():
model: BaseModel = MODELS.get(self.model_name)(**attrs)
else:
logger.warning(
f'Could not find {self.model_name} in registered models. '
f'Registering {self.model_name} using the BaseChatTemplate.')
model = BaseChatTemplate(**attrs)
return model
def to_json(self, file_path=None):
"""Convert the dataclass instance to a JSON formatted string and
optionally save to a file."""
json_str = json.dumps(dataclasses.asdict(self),
ensure_ascii=False,
indent=4)
if file_path:
with open(file_path, 'w', encoding='utf-8') as file:
file.write(json_str)
return json_str
@classmethod
def from_json(cls, file_or_string):
"""Construct a dataclass instance from a JSON file or JSON string."""
try:
# Try to open the input_data as a file path
with open(file_or_string, 'r', encoding='utf-8') as file:
json_data = file.read()
except FileNotFoundError:
# If it's not a file path, assume it's a JSON string
json_data = file_or_string
except IOError:
# If it's not a file path and not a valid JSON string, raise error
raise ValueError(
'Invalid input. Must be a file path or a valid JSON string.')
json_data = json.loads(json_data)
assert json_data.get('model_name', None) is not None, \
'model_name is required for a json chat template.'
if json_data['model_name'] not in MODELS.module_dict.keys():
MODELS.register_module(json_data['model_name'],
module=BaseChatTemplate)
return cls(**json_data)
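# Usage sketch of the ChatTemplateConfig dataclass above (field values and
# the template name are illustrative): unknown model names fall back to
# BaseChatTemplate, and the config can be round-tripped through a JSON file
# via to_json()/from_json().
def _chat_template_config_example():
    """Illustrative sketch; 'my-vicuna-style' is a made-up template name."""
    cfg = ChatTemplateConfig(model_name='my-vicuna-style',
                             meta_instruction='You are a helpful assistant.',
                             user='USER: ',
                             eoh=' ',
                             assistant='ASSISTANT: ',
                             eoa='</s>',
                             stop_words=['</s>'])
    cfg.to_json('chat_template.json')  # also returns the JSON string
    restored = ChatTemplateConfig.from_json('chat_template.json')
    return restored.chat_template  # a BaseChatTemplate instance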
@MODELS.register_module(name='internlm')
@MODELS.register_module(name='llama')
@MODELS.register_module(name='base')
class BaseModel:
......@@ -24,18 +100,10 @@ class BaseModel:
def __init__(self,
session_len=2048,
top_p=0.8,
top_k=None,
temperature=0.8,
repetition_penalty=1.0,
capability='chat',
stop_words=None,
**kwargs):
self.session_len = session_len
self.top_p = top_p
self.top_k = top_k
self.temperature = temperature
self.repetition_penalty = repetition_penalty
self.stop_words = stop_words
self.capability = capability
......@@ -50,43 +118,8 @@ class BaseModel:
Returns:
str: the concatenated prompt
"""
if self.capability == 'completion':
return prompt
else:
return self.decorate_prompt(prompt, sequence_start)
@abstractmethod
def decorate_prompt(self, prompt, sequence_start):
return prompt
@staticmethod
def _translate_messages(messages: List):
"""Translate messages into system, user speaking list, assistant
speaking list.
Args:
messages (List): chat history
Returns:
Tuple: consists of system (str), users (List[str]),
assistants (List[str])
"""
system = None
users = []
assistants = []
assert isinstance(messages, List)
for message in messages:
msg_role = message['role']
if msg_role == 'system':
system = message['content']
elif msg_role == 'user':
users.append(message['content'])
elif msg_role == 'assistant':
assistants.append(message['content'])
else:
raise ValueError(f'Unknown role: {msg_role}')
assistants.append(None)
return system, users, assistants
@abstractmethod
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
......@@ -103,31 +136,40 @@ class BaseModel:
return self.get_prompt(messages)
# chat history processing in derived classes
@property
def sampling_param(self):
return SamplingParam(top_p=self.top_p,
top_k=self.top_k,
temperature=self.temperature,
repetition_penalty=self.repetition_penalty)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
return None
@MODELS.register_module(name='wizardlM')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseModel):
"""Chat template of vicuna model."""
def __init__(
self,
system="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. """, # noqa: E501
user='USER',
assistant='ASSISTANT',
**kwargs):
class BaseChatTemplate(BaseModel):
"""Base Chat template."""
def __init__(self,
system='',
meta_instruction='',
eosys='',
user='',
eoh='',
assistant='',
eoa='',
separator='',
**kwargs):
super().__init__(**kwargs)
self.system = system
self.meta_instruction = meta_instruction
self.user = user
self.eoh = eoh
self.eoa = eoa
self.separator = separator
self.eosys = eosys
self.assistant = assistant
def decorate_prompt(self, prompt, sequence_start=True):
def get_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
......@@ -138,12 +180,20 @@ class Vicuna(BaseModel):
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if self.capability == 'completion':
return prompt
if sequence_start:
return f'{self.system} {self.user}: {prompt} {self.assistant}: '
# None is different from ''
if self.meta_instruction is not None:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'</s>{self.user}: {prompt} {self.assistant}: '
return f'{self.separator}{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
......@@ -156,20 +206,65 @@ class Vicuna(BaseModel):
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = system + ' '
for user, assistant in zip(users, assistants):
if assistant:
ret += f'{self.user}: {user} {self.assistant}: {assistant}</s>'
else:
ret += f'{self.user}: {user} {self.assistant}: '
box_map = dict(user=self.user,
assistant=self.assistant,
system=self.system)
eox_map = dict(user=self.eoh,
assistant=self.eoa + self.separator,
system=self.eosys)
ret = ''
if self.meta_instruction is not None:
if len(messages) and messages[0]['role'] != 'system':
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{box_map[role]}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
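# Illustrative sketch, not part of this commit: how BaseChatTemplate assembles
# prompts from the pieces above. The marker strings and the helper name
# `_demo_base_chat_template` are made up for demonstration only.
def _demo_base_chat_template():
    demo = BaseChatTemplate(system='<sys>', meta_instruction='be helpful',
                            eosys='\n', user='<usr>', eoh='\n',
                            assistant='<bot>', eoa='</s>', separator='\n')
    # a plain string is decorated by get_prompt()
    assert demo.get_prompt('hi') == '<sys>be helpful\n<usr>hi\n<bot>'
    # an OpenAI-style message list is rendered by messages2prompt(); when the
    # first message is not a system message, meta_instruction is prepended
    msgs = [dict(role='user', content='hi'),
            dict(role='assistant', content='hello'),
            dict(role='user', content='bye')]
    assert demo.messages2prompt(msgs) == \
        '<sys>be helpful\n<usr>hi\n<bot>hello</s>\n<usr>bye\n<bot>'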
@MODELS.register_module(name='wizardlm')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseChatTemplate):
"""Chat template of vicuna model."""
def __init__(
self,
meta_instruction="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""", # noqa: E501
eosys=' ',
user='USER: ',
eoh=' ',
assistant='ASSISTANT: ',
eoa='</s>',
stop_words=['</s>'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'vicuna' in model_path.lower():
return 'vicuna'
if 'wizardlm' in model_path.lower():
return 'wizardlm'
@MODELS.register_module(name='internlm-chat')
@MODELS.register_module(name='internlm-chat-7b')
class InternLMChat7B(BaseModel):
@MODELS.register_module(name='internlm')
class InternLMChat7B(BaseChatTemplate):
"""Chat template of InternLM model."""
def __init__(
......@@ -179,67 +274,36 @@ class InternLMChat7B(BaseModel):
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
""", # noqa: E501
eosys='\n',
user='<|User|>:',
eoh='\n',
eoa='<eoa>\n',
eosys='\n',
assistant='<|Bot|>:',
eoa='<eoa>',
separator='\n',
stop_words=['<eoa>'],
**kwargs):
super().__init__(**kwargs)
self.system = system
self.meta_instruction = meta_instruction
self.user = user
self.eoh = eoh
self.eoa = eoa
self.eosys = eosys
self.assistant = assistant
self.stop_words = stop_words
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'\n{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
path = model_path.lower()
if all([c not in path for c in ['internlm2', '8k']]) and \
all([c in path for c in ['internlm', 'chat']]):
return 'internlm'
@MODELS.register_module(name='internlm-chat-20b')
......@@ -254,7 +318,7 @@ class InternLMChat7B8K(InternLMChat7B):
@MODELS.register_module(name='internlm-20b')
class InternLMBaseModel20B(BaseModel):
class InternLMBaseModel20B(BaseChatTemplate):
"""Generation parameters of InternLM-20B-Base model."""
def __init__(self, session_len=4096, capability='completion', **kwargs):
......@@ -263,71 +327,94 @@ class InternLMBaseModel20B(BaseModel):
**kwargs)
@MODELS.register_module(
name=['internlm2-1_8b', 'internlm2-7b', 'internlm2-20b'])
class InternLM2BaseModel7B(BaseChatTemplate):
"""Generation parameters of InternLM2-7B-Base model."""
def __init__(self, session_len=32768, capability='completion', **kwargs):
super().__init__(session_len=session_len,
capability=capability,
**kwargs)
@MODELS.register_module(name=[
'internlm2-chat', 'internlm2-chat-1_8b', 'internlm2-chat-7b',
'internlm2-chat-20b'
])
@MODELS.register_module(name='internlm2')
class InternLM2Chat7B(InternLMChat7B):
"""Chat template and generation parameters of InternLM2-Chat-7B."""
def __init__(self,
session_len=32768,
system='<|im_start|>system\n',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|action_end|>'],
**kwargs):
super(InternLM2Chat7B, self).__init__(session_len=session_len,
system=system,
user=user,
assistant=assistant,
eosys=eosys,
eoh=eoh,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'internlm2' in path and ('chat' in path or 'math' in path):
return 'internlm2'
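# Illustrative sketch, not part of this commit: the first-turn prompt produced
# by the internlm2 template registered above. `_demo_internlm2_prompt` is a
# hypothetical helper for demonstration only.
def _demo_internlm2_prompt():
    chat = MODELS.get('internlm2')()
    # inherits InternLMChat7B's meta_instruction; the result is shaped like
    # '<|im_start|>system\n{meta_instruction}<|im_end|>\n'
    # '<|im_start|>user\nhi<|im_end|>\n<|im_start|>assistant\n'
    return chat.get_prompt('hi')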
@MODELS.register_module(name='baichuan-7b')
class Baichuan7B(BaseModel):
@MODELS.register_module(name='baichuan-base')
class Baichuan7B(BaseChatTemplate):
"""Generation parameters of Baichuan-7B base model."""
def __init__(self, repetition_penalty=1.1, **kwargs):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.repetition_penalty = repetition_penalty
@MODELS.register_module(name='baichuan2-7b')
class Baichuan2_7B(BaseModel):
@MODELS.register_module(name='baichuan2')
class Baichuan2_7B(BaseChatTemplate):
"""Chat template and generation parameters of Baichuan2-7B-Base and
Baichuan2-7B-Chat models."""
def __init__(self,
temperature=0.3,
top_k=5,
top_p=0.85,
repetition_penalty=1.05,
user='<reserved_106>',
assistant='<reserved_107>',
**kwargs):
super().__init__(temperature=temperature,
top_k=top_k,
top_p=top_p,
repetition_penalty=repetition_penalty,
**kwargs)
self.user_token = '<reserved_106>' # id = 195
self.assistant_token = '<reserved_107>' # id = 196
super().__init__(user=user, assistant=assistant, **kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
return f'{self.user_token}{prompt}{self.assistant_token}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
ret = ''
for user, assistant in zip(users, assistants):
ret += f'{self.user_token}{user}{self.assistant_token}'
if assistant:
ret += f'{assistant}'
return ret
path = model_path.lower()
if 'baichuan2' in path and 'chat' in path:
return 'baichuan2'
@MODELS.register_module(name='puyu')
class Puyu(BaseModel):
class Puyu(BaseChatTemplate):
"""Chat template of puyu model.This is only for internal usage in Shanghai
AI Laboratory."""
......@@ -341,217 +428,136 @@ class Puyu(BaseModel):
eoa='',
stop_words=None,
**kwargs):
super().__init__(**kwargs)
self.meta_instruction = meta_instruction
self.system = system
self.user = user
self.assistant = assistant
self.stop_words = stop_words
self.eosys = eosys
self.eoh = eoh
self.eoa = eoa
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.eoa}{self.user}{prompt}{self.eoh}{self.assistant}'
super().__init__(meta_instruction=meta_instruction,
system=system,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
sequence_start (bool): flag to start the sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
if 'puyu' in model_path.lower():
return 'puyu'
@MODELS.register_module(name='llama2')
class Llama2(BaseModel):
@MODELS.register_module(name=['llama2', 'llama-2', 'llama-2-chat'])
class Llama2(BaseChatTemplate):
"""Chat template of LLaMA2 model."""
def __init__(
self,
b_inst='[INST]',
e_inst='[/INST]',
b_sys='<<SYS>>\n',
e_sys='\n<</SYS>>\n\n',
system="""\
system='[INST] <<SYS>>\n',
meta_instruction="""\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", # noqa: E501
eosys='\n<</SYS>>\n\n',
assistant=' [/INST] ',
eoa='</s>',
separator='<s>[INST] ',
session_len=4096,
**kwargs):
super().__init__(**kwargs)
self.b_inst = b_inst
self.e_inst = e_inst
self.b_sys = b_sys
self.e_sys = e_sys
self.default_sys_prompt = system
self.session_len = session_len
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
assistant=assistant,
eoa=eoa,
separator=separator,
session_len=session_len,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.b_inst} ' \
f'{self.b_sys} {self.default_sys_prompt} {self.e_sys}' \
f'{prompt} {self.e_inst} '
return f'{self.b_inst} {prompt} {self.e_inst} '
if 'llama-2' in model_path.lower() or 'llama2' in model_path.lower():
return 'llama2'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.default_sys_prompt if not system else system
ret = f'{self.b_inst} {self.b_sys} {system} {self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
if i != 0:
ret += f'{self.b_inst} '
if assistant:
ret += f'{user} {self.e_inst} {assistant}'
else:
ret += f'{user} {self.e_inst} '
return ret
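# Illustrative sketch, not part of this commit: with the BaseChatTemplate-based
# llama2 template above, a first-turn prompt is rendered as
# '[INST] <<SYS>>\n{meta_instruction}\n<</SYS>>\n\n{prompt} [/INST] '.
# `_demo_llama2_prompt` is a hypothetical helper for demonstration only.
def _demo_llama2_prompt():
    chat = MODELS.get('llama2')()
    return chat.get_prompt("What's the capital of France?")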
@MODELS.register_module(name='qwen-72b')
@MODELS.register_module(name='qwen-14b')
@MODELS.register_module(name='qwen-7b')
class Qwen7BChat(BaseModel):
@MODELS.register_module(name='qwen')
class Qwen7BChat(BaseChatTemplate):
"""Chat template for Qwen-7B-Chat."""
def __init__(self,
session_len=8192,
top_p=0.5,
top_k=40,
temperature=1.0,
im_start='<|im_start|>',
im_end='<|im_end|>',
system='You are a helpful assistant.',
system='<|im_start|>system\n',
meta_instruction='You are a helpful assistant.',
eosys='<|im_end|>\n',
user='<|im_start|>user\n',
eoh='<|im_end|>\n',
assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>'],
**kwargs):
super().__init__(**kwargs)
self.session_len = session_len
self.top_p = top_p
self.top_k = top_k
self.temperature = temperature
self.im_start = im_start
self.im_end = im_end
self.system = system
self.stop_words = stop_words
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.im_start}system\n{self.system}{self.im_end}' \
f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
session_len=session_len,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.im_start}system\n{system}{self.im_end}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n{assistant}'
else:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return ret
if 'qwen' in model_path.lower():
return 'qwen'
@MODELS.register_module(name='codellama')
class CodeLlama(Llama2):
def __init__(self,
system='',
meta_instruction='',
session_len=4096,
suffix_first=False,
stop_words=None,
**kwargs):
super().__init__(**kwargs)
super().__init__(meta_instruction=meta_instruction,
session_len=session_len,
stop_words=stop_words,
**kwargs)
caps = ['completion', 'infilling', 'chat', 'python']
assert self.capability in caps, \
f'{self.capability} is not supported. ' \
f'The supported capabilities are: {caps}'
self.default_sys_prompt = system
self.meta_instruction = meta_instruction
self.session_len = session_len
self.suffix_first = suffix_first
self.stop_words = stop_words
# The following sampling parameters refers to https://github.com/facebookresearch/codellama # noqa: E501
if self.capability == 'completion' or self.capability == 'python':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.2)
if self.capability == 'chat':
self.top_p = kwargs.get('top_p', 0.95)
self.temperature = kwargs.get('temperature', 0.2)
elif self.capability == 'infilling':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.0)
if self.capability == 'infilling':
if self.stop_words is None:
self.stop_words = ['<EOT>']
def decorate_prompt(self, prompt, sequence_start=True):
def get_prompt(self, prompt, sequence_start=True):
if self.capability == 'infilling':
return self._infill_prompt(prompt)
elif self.capability == 'chat':
return self._get_prompt(prompt, sequence_start)
return super().get_prompt(prompt, sequence_start)
else:  # python specialist
return prompt
......@@ -565,92 +571,130 @@ class CodeLlama(Llama2):
prompt = f'<PRE> {prefix} <SUF>{suffix} <MID>'
return prompt
def _get_prompt(self, prompt, sequence_start):
prompt = prompt.strip()
if sequence_start:
return f'{self.b_inst} ' \
f'{self.b_sys}{self.default_sys_prompt}{self.e_sys}' \
f'{prompt} {self.e_inst}'
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'codellama' in model_path.lower():
return 'codellama'
@MODELS.register_module(name='falcon')
class Falcon(BaseModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'falcon' in model_path.lower():
return 'falcon'
@MODELS.register_module(name='chatglm2-6b')
@MODELS.register_module(name='chatglm')
class ChatGLM2(BaseModel):
def __init__(self,
user='问:',
eoh='\n\n',
assistant='答:',
eoa='\n\n',
**kwargs):
super().__init__(**kwargs)
self._user = user
self._assistant = assistant
self._eoh = eoh
self._eoa = eoa
self.count = 0
return f'{self.b_inst} {prompt} {self.e_inst}'
def get_prompt(self, prompt, sequence_start=True):
"""get prompt."""
# need more check
# https://github.com/THUDM/ChatGLM2-6B/issues/48
# [64790, 64792] to be prepended
self.count += 1
ret = f'[Round {self.count}]\n\n'
ret += f'{self._user}{prompt}{self._eoh}'
ret += f'{self._assistant}'
return ret
def messages2prompt(self, messages, sequence_start=True):
assert self.capability == 'chat', \
f'codellama messages2prompt only supports chat mode ' \
f'but got {self.capability} mode'
return super().messages2prompt(messages, sequence_start)
"""message to prompt."""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
ret = ''
count = 0
for message in messages:
role = message['role']
content = message['content']
if role == 'user':
count += 1
ret += f'[Round {count}]\n\n'
ret += f'{self._user}{content}{self._eoh}'
ret += f'{self._assistant}'
if role == 'assistant':
ret += f'{content}'
return ret
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'chatglm' in model_path.lower():
return 'chatglm'
@MODELS.register_module(name='solar')
class SOLAR(BaseModel):
@MODELS.register_module(name=['solar', 'solar-70b'])
class SOLAR(BaseChatTemplate):
"""Chat template of SOLAR model.
`https://huggingface.co/upstage/SOLAR-0-70b-16bit`
"""
def __init__(self,
b_sys='### System:\n',
e_sys='\n\n',
system='### System:\n',
eosys='\n\n',
user='### User:\n',
eoh='\n\n',
assistant='### Assistant:\n',
eoa='\n\n',
system='',
meta_instruction='',
session_len=2048,
**kwargs):
super().__init__(**kwargs)
self.b_sys = b_sys
self.e_sys = e_sys
self.system = system
self.eosys = eosys
self.user = user
self.eoh = eoh
self.assistant = assistant
self.eoa = eoa
self.system = system
self.meta_instruction = meta_instruction
self.session_len = session_len
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.b_sys}{self.system}{self.e_sys}' \
f'{self.user}{prompt}{self.eoh}{self.assistant}'
return f'{self.user}{prompt}{self.eoh}{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.b_sys}{system}{self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
ret += f'{self.user}{user}{self.eoh}{self.assistant}'
if assistant:
ret += f'{assistant}{self.eoa}'
return ret
if 'solar' in model_path.lower():
return 'solar'
@MODELS.register_module(name='ultracm')
@MODELS.register_module(name='ultralm')
class UltraChat(BaseModel):
class UltraChat(BaseChatTemplate):
"""Template of UltraCM and UltraLM models.
`https://huggingface.co/openbmb/UltraCM-13b`
......@@ -659,147 +703,222 @@ class UltraChat(BaseModel):
def __init__(
self,
system="""User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.</s>""", # noqa: E501
eos='</s>',
system='User: ',
meta_instruction="""A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.""", # noqa: E501
eosys='</s>\n',
user='User: ',
eoh='</s>\n',
assistant='Assistant: ',
eoa='</s>',
separator='\n',
stop_words=['</s>'],
session_len=2048,
**kwargs):
super().__init__(**kwargs)
self.system = system
self.eos = eos
self.session_len = session_len
self.user = user
self.assistant = assistant
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
prompt (str): the input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
return f'\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
session_len=session_len,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template. Only evaluate the last instruction completion pair.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{system}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}{assistant}{self.eos}'
else:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}'
return ret
if 'ultracm' in model_path.lower():
return 'ultracm'
if 'ultralm' in model_path.lower():
return 'ultralm'
@MODELS.register_module(name='yi')
class Yi(BaseModel):
@MODELS.register_module(name=['yi', 'yi-chat', 'yi-200k', 'yi-34b'])
class Yi(BaseChatTemplate):
"""Chat template of Yi model."""
def __init__(self,
system='<|im_start|>system\n',
meta_instruction=None,
eosys='<|im_end|>\n',
user='<|im_start|>user\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>\n',
eosys='<|im_end|>\n',
assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|endoftext|>'],
**kwargs):
super().__init__(**kwargs)
self.system = system
self.meta_instruction = meta_instruction
self.user = user
self.eoh = eoh
self.eoa = eoa
self.eosys = eosys
self.assistant = assistant
self.stop_words = stop_words
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
if self.meta_instruction is None:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
path = model_path.lower()
if 'yi' in path and 'vl' not in path:
return 'yi'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@MODELS.register_module(name=['mistral', 'mixtral'])
@MODELS.register_module(name=['Mistral-7B-Instruct', 'Mixtral-8x7B-Instruct'])
class MistralChat(BaseChatTemplate):
"""Template of Mistral and Mixtral Instruct models.
`https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1`
`https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1`
"""
def __init__(self,
user='[INST] ',
eoh=' [/INST]',
eoa='</s>',
session_len=2048,
**kwargs):
super().__init__(user=user,
eoh=eoh,
eoa=eoa,
session_len=session_len,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if 'instruct' in model_path.lower():
if 'mistral' in model_path.lower():
return 'mistral'
if 'mixtral' in model_path.lower():
return 'mixtral'
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
@MODELS.register_module(name=['gemma'])
class Gemma(BaseChatTemplate):
"""Template of Gemma models.
`https://huggingface.co/google/gemma-7b-it`
"""
def __init__(self,
user='<start_of_turn>user\n',
eoh='<end_of_turn>\n',
assistant='<start_of_turn>model\n',
eoa='<end_of_turn>\n',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'gemma' in model_path.lower():
return 'gemma'
def main(model_name: str = 'test'):
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'
model = MODELS.get(model_name)()
prompt = model.get_prompt(prompt='hi')
print(prompt)
print(f'session_len: {model.session_len}')
@MODELS.register_module(name=['deepseek-chat'])
@MODELS.register_module(name=['deepseek'])
class Deepseek(BaseChatTemplate):
def __init__(self,
user='User: ',
eoh='\n\n',
assistant='Assistant: ',
eoa='<|end▁of▁sentence|>',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
if __name__ == '__main__':
import fire
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
fire.Fire(main)
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'deepseek' in path and 'chat' in path:
return 'deepseek'
@MODELS.register_module(name=['yi-vl'])
class YiVL(BaseChatTemplate):
def __init__(
self,
meta_instruction="""This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers. 这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n""", # noqa: E501
user='### Human: ',
eoh='\n',
assistant='### Assistant:',
eoa='\n',
stop_words=['###'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'yi-vl' in path:
return 'yi-vl'
def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.
Args:
query (str): the input query. Could be a model path.
Return:
str | None: the possible model name or none.
"""
for name, model in MODELS.module_dict.items():
if model.match(query):
return model.match(query)
try:
from transformers import AutoTokenizer
tokenizer_config = AutoTokenizer.from_pretrained(
query, trust_remote_code=True)
if tokenizer_config.chat_template is None:
return 'base'
except Exception as e:
assert type(e) == OSError
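# Illustrative sketch, not part of this commit: best_match_model maps a model
# path to a registered template name through the per-class match() hooks. The
# paths below are examples only and nothing is downloaded by match().
def _demo_best_match_model():
    assert best_match_model('./models/internlm2-chat-7b') == 'internlm2'
    assert best_match_model('./models/vicuna-13b-v1.5') == 'vicuna'
    assert best_match_model('./models/llama-2-7b-chat') == 'llama2'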
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat with torch models."""
......@@ -13,6 +13,7 @@ class LoadNoInit:
self.normal_ = torch.nn.init.normal_
self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
self.kaiming_normal_ = torch.nn.init.kaiming_normal_
self.tensor_normal_ = torch.Tensor.normal_
def __enter__(self, *args, **kwargs):
"""Replace initializers with no-op."""
......@@ -24,6 +25,7 @@ class LoadNoInit:
torch.nn.init.normal_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None
torch.Tensor.normal_ = lambda *args, **kwargs: None
def __exit__(self, *args, **kwargs):
"""Recover."""
......@@ -35,3 +37,4 @@ class LoadNoInit:
torch.nn.init.normal_ = self.normal_
torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
torch.nn.init.kaiming_normal_ = self.kaiming_normal_
torch.Tensor.normal_ = self.tensor_normal_
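# Illustrative sketch, not part of this commit: LoadNoInit is meant to wrap
# model construction so that the random weight-initialization kernels become
# no-ops when the weights are immediately overwritten by a checkpoint. The
# helper name and model path below are made up for demonstration only.
def _demo_load_no_init(model_path='path/to/hf/model'):
    import torch
    from transformers import AutoModelForCausalLM
    with LoadNoInit():
        # inside the context, torch.nn.init.* and Tensor.normal_ do nothing,
        # so building the model skips the costly random initialization
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     torch_dtype=torch.float16)
    return model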
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import torch.nn as nn
from .base import BasicAdapter, BasicAdapterFast
from .internlm import InternLMAdapter
from .llama2 import Llama2Adapter
logger = logging.getLogger(__name__)
def _get_default_adapter(tokenizer):
if tokenizer.is_fast:
return BasicAdapterFast
else:
return BasicAdapter
def init_adapter(model: nn.Module, tokenizer, adapter=None):
if adapter is None:
for v in model.modules():
if 'InternLMModel' in v.__class__.__name__:
Adapter = InternLMAdapter
break
elif 'LlamaModel' in v.__class__.__name__:
Adapter = Llama2Adapter
break
else:
Adapter = _get_default_adapter(tokenizer)
elif adapter == 'llama1':
Adapter = _get_default_adapter(tokenizer)
else:
raise ValueError(f'Adapter {adapter} is not allowed.')
logger.info(f'Using adapter {Adapter.__name__}')
return Adapter(tokenizer)
# Copyright (c) OpenMMLab. All rights reserved.
"""Basic adapter suitable for general HuggingFace models."""
import logging
import re
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerBase,
PreTrainedTokenizerFast)
logger = logging.getLogger(__name__)
class BaseAdapter:
"""Base class for all adapters.
Note:
Adapters coordinate with the session manager to prepare input_ids.
The full sequence fed to the model is as follows:
```
adapter.start_ids
adapter.encode_and_decorate(user_input_1)
output_1_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_2)
output_2_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_3)
```
Thus the adapter is responsible for providing model-specific
``start_ids``, ``sep_ids``, and a method to encode a single prompt.
"""
def __init__(self, tokenizer: PreTrainedTokenizerBase):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Model specific method to encode and decorate prompt."""
raise NotImplementedError
def decode(self, value):
"""Model specific method to decode single value to string."""
raise NotImplementedError
@property
def stopping_criteria(self):
"""Model specific stopping criteria for generation."""
return None
@property
def start_ids(self):
"""Model specific start ids."""
return [self.tokenizer.bos_token_id]
@property
def sep_ids(self):
"""Model specific separation ids."""
return [self.tokenizer.bos_token_id]
class BasicAdapter(BaseAdapter):
"""Basic adapter for slow tokenizers."""
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> for the session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Fallback when tokenizer is not fast."""
self.tokenizer: PreTrainedTokenizer
tok = self.tokenizer.decode(value)
return tok + ' '
class BasicAdapterFast(BaseAdapter):
"""Basic adapter for slow tokenizers."""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> for the session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Decode with fast tokenizers."""
self.tokenizer: PreTrainedTokenizerFast
tok = self.tokenizer._convert_id_to_token(value)
if tok.startswith('▁'): # sentencepiece
space = ' '
tok = tok[1:]
else:
space = ''
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '\r':
tok = '\n'
tok = space + tok
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
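# Illustrative sketch, not part of this commit: driving a basic adapter with a
# HuggingFace tokenizer. The tokenizer path is an example only.
def _demo_basic_adapter(tokenizer_path='path/to/hf/model'):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    adapter = BasicAdapterFast(tokenizer) if tokenizer.is_fast \
        else BasicAdapter(tokenizer)
    input_ids = adapter.encode_and_decorate('hello world')  # shape [1, n]
    # during streaming, generated ids are decoded one at a time
    piece = adapter.decode(int(input_ids[0, 0]))
    return input_ids, piece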
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
import torch
from transformers import (PreTrainedTokenizerFast, StoppingCriteria,
StoppingCriteriaList)
from .base import BaseAdapter
logger = logging.getLogger(__name__)
class InternLMStoppingCriteria(StoppingCriteria):
"""Stopping criteria for HF version of InternLM."""
def __call__(self, input_ids, *args, **kwargs) -> bool:
return input_ids[0, -1] in [2, 103028]
class InternLMAdapter(BaseAdapter):
"""Adapter for InternLM.
InternLM uses the following template, and '\n' should be encoded to token id 13.
<bos> (no actual newline here, just for better readability)
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
...
<eos>
"""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
# ids of '<|User|>:'
B_USER_ID = torch.tensor([[333, 352, 1621, 352, 27232]])
# ids of '<eoh>\n<|Bot|>:'
E_USER_ID = torch.tensor([[103027, 13, 333, 352, 23845, 352, 27232]])
# ids of '<bos>'
start_ids = [1]
# ids of '\n'
sep_ids = [13]
def __init__(self, tokenizer: PreTrainedTokenizerFast):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template.
Note:
we leave <bos> and chat history for session manager to add,
so we will decorate input_ids to '<|User|>:{prompt}<eoh>\n<|Bot|>:'
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=False,
return_tensors='pt',
)
# This is f'<|User|>:{prompt}<eoh>\n<|Bot|>:'
# but force \n to 13 instead of 364
input_ids = torch.cat([self.B_USER_ID, input_ids, self.E_USER_ID],
dim=1)
return input_ids
def decode(self, value):
"""Decode generated tokens for InternLM."""
tok = self.tokenizer.decode(value)
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '<eoa>' or tok == '\r':
tok = '\n'
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
@property
def stopping_criteria(self):
return StoppingCriteriaList([InternLMStoppingCriteria()])
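# Illustrative sketch, not part of this commit: the adapter turns a raw prompt
# into the token ids of '<|User|>:{prompt}<eoh>\n<|Bot|>:'; <bos> and the chat
# history are prepended later by the session manager. The path is an example.
def _demo_internlm_adapter(tokenizer_path='path/to/internlm-chat-7b'):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
                                              trust_remote_code=True)
    adapter = InternLMAdapter(tokenizer)
    # equals torch.cat([B_USER_ID, ids('hi'), E_USER_ID], dim=1)
    return adapter.encode_and_decorate('hi')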
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
from transformers import PreTrainedTokenizerFast
from .base import BasicAdapterFast
logger = logging.getLogger(__name__)
B_INST, E_INST = '[INST]', '[/INST]'
B_SYS, E_SYS = '<<SYS>>\n', '\n<</SYS>>\n\n'
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" # noqa: E501
class Llama2Adapter(BasicAdapterFast):
"""Adapter for llama2.
Llama2 uses the following template, and the first user prompt
should contain a system prompt.
The user can specify the system prompt with a <<SYS>> tag; otherwise
the default system prompt is prepended to the user's input.
<bos>
[INST]<space>
<<SYS>>\n
SYSTEM_PROMPT\n
<</SYS>>\n\n
{user_prompt_1}<space>
[/INST]<space>
{answer_1}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}<space>
[/INST]<space>
{answer_2}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}(no space here)
...
"""
start_ids = []
sep_ids = []
def __init__(self, tokenizer: PreTrainedTokenizerFast):
super().__init__(tokenizer)
self.prev_round = 0
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template."""
if self.prev_round == 0:
res = re.search(r'<<SYS>>(.*?)<</SYS>>(.*)', prompt)
if res:
prompt = B_SYS + res.group(1).strip() + \
E_SYS + res.group(2).strip()
else:
prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + prompt
prompt = f'{B_INST} {prompt.strip()} {E_INST}'
logger.debug(f'decorated prompt: {repr(prompt)}')
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=True,
return_tensors='pt',
)
self.prev_round += 1
return input_ids
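# Illustrative sketch, not part of this commit: on the first round the default
# (or a user-supplied <<SYS>>...<</SYS>>) system prompt is injected; later
# rounds are wrapped with [INST] ... [/INST] only. The path is an example.
def _demo_llama2_adapter(tokenizer_path='path/to/llama-2-7b-chat'):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    adapter = Llama2Adapter(tokenizer)
    first = adapter.encode_and_decorate('hello')   # contains the system prompt
    second = adapter.encode_and_decorate('again')  # plain '[INST] again [/INST]'
    return first, second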
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat through command line.
This submodule allows the user to chat with a language model through the
command line, and optionally accelerate the model using backends like deepspeed.
Example 1: Chat with default setting
```python
python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
```
Example 2: Disable sampling
```python
python -m lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0
```
Example 3: Accelerate with deepspeed inference
```python
python -m lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
Note: to use deepspeed, you need to install deepspeed,
and if you hope to accelerate InternLM, you need the customized version
https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0
Example 4: Tensor parallel the model on 2 GPUs
```python
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed \
```
This module also allows the following control commands to change
generation behaviors during chat.
- `exit`: terminate and exit chat
- `config set key=value`: change generation config `key` to `value`,
e.g. config temperature=0 disables sampling for the following chats
- `clear`: clear chat history
"""
import itertools
import logging
from typing import Optional
import torch
from transformers import GenerationConfig, PreTrainedModel
from .adapters import init_adapter
from .dist import get_local_rank, get_rank, get_world_size
from .model import accel_model, init_model
from .session import BasicSessionManagerWithHistory
from .utils import BasicStreamer, TerminalIO, control
logger = logging.getLogger(__name__)
def set_logging(log_file: str, debug: bool):
torch.set_printoptions(linewidth=120)
level = logging.DEBUG if debug else logging.INFO
log_file = log_file or 'chat.log'
if (r := get_rank()) != 0:
log_file = log_file + f'.{r}'
logging.basicConfig(level=level,
format=('%(filename)s: '
'%(levelname)s: '
'%(funcName)s(): '
'%(lineno)d:\t'
'%(message)s'),
filename=log_file,
filemode='w')
print(f'Worker {get_rank()} logging to {log_file}')
def main(
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None,
):
"""Chat with model through terminal.
import os
import random
from typing import List
from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig
from lmdeploy.model import MODELS, best_match_model
from lmdeploy.tokenizer import DetokenizeState, Tokenizer
os.environ['TM_LOG_LEVEL'] = 'ERROR'
def input_prompt(model_name):
"""Input a prompt in the consolo interface."""
if model_name == 'codellama':
print('\nenter !! to end the input >>>\n', end='')
sentinel = '!!'
else:
print('\ndouble enter to end input >>> ', end='')
sentinel = '' # ends when this string is seen
return '\n'.join(iter(input, sentinel))
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret
def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
"""Return a list of token ids corresponding to stop-words."""
if stop_words is None:
return None
assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
f'stop_words must be a list but got {type(stop_words)}'
stop_words = [
tokenizer.encode(stop_word, False)[-1] for stop_word in stop_words
]
assert isinstance(stop_words, List) and all(
isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
return stop_words
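# Illustrative sketch, not part of this commit: stop words are reduced to the
# last token id of each word, which is how the engine compares generated ids.
# The model path and helper name are examples only.
def _demo_stop_words(model_path='path/to/hf/model'):
    tokenizer = Tokenizer(model_path)
    model_name = best_match_model(model_path) or 'base'
    model = MODELS.get(model_name)()
    return _stop_words(model.stop_words, tokenizer)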
def run_chat(model_path: str,
engine_config: PytorchEngineConfig,
gen_config: EngineGenerationConfig = None,
session_id: int = 1,
trust_remote_code: bool = True):
"""An example to perform model inference through the command line
interface.
Args:
model_path (str): Path to model.
tokenizer_path (str): Path to tokenizer.
accel (str): Model accelerator.
max_new_tokens (int): Maximum number of tokens to generate.
temperature (float): Temperature for sampling.
top_p (float): Top p for sampling.
seed (int): Random seed.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
This argument is passed directly to transformers' ``AutoTokenizer.from_pretrained``.
Generally, users should choose fast tokenizers,
but if the fast tokenizer raises an error, try forcing a slow one.
max_alloc (int): Maximum memory to allocate (for deepspeed).
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
This includes both the history and the current session.
log_file (str): Path to log file.
debug (bool): Whether to enable debug mode.
adapter (str): Force to use an adapter.
Generally, users should not use this argument because the adapter is selected
based on the type of model. It is only required when that is impossible,
e.g. distinguishing llama 1/2 based on the `LlamaForCausalLM` class.
Currently, only "llama1" is acceptable for llama1 models.
""" # noqa: E501
set_logging(log_file, debug)
# workers should sync in sampling
torch.manual_seed(seed)
local_rank = get_local_rank()
world_size = get_world_size()
# Init model and tokenizer
if not tokenizer_path:
tokenizer_path = model_path
model, tokenizer = init_model(
model_path,
tokenizer_path,
use_fast_tokenizer=use_fast_tokenizer,
)
# Init adapter based on model and tokenizer
adapter = init_adapter(model, tokenizer, adapter)
# Accelerate model
model: PreTrainedModel = accel_model(model,
accel,
max_alloc=max_alloc,
tp_size=world_size)
# warmup
warmup_config = GenerationConfig(
max_new_tokens=1,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
model.generate(torch.tensor([[6]], device=get_local_rank()), warmup_config)
gen_config = GenerationConfig(
max_new_tokens=max_new_tokens,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
# Session manager handling history
max_session_len = max_alloc if max_session_len is None else max_session_len
sm = BasicSessionManagerWithHistory(max_session_len=max_session_len,
start_ids=adapter.start_ids,
sep_ids=adapter.sep_ids)
io = TerminalIO()
streamer = BasicStreamer(adapter.decode, io.output)
for r in itertools.count(1):
# User input from IO
logger.info(f'Round {r}')
prompt: str = io.input()
logger.info(f'User input: {prompt}')
# Allow user to change config during runtime or exit
if control(prompt, gen_config, sm):
continue
# Tokenize and apply model specific templates
input_ids = adapter.encode_and_decorate(prompt)
logger.info(f'Input ids:\n{input_ids}')
# Prepend chat history (tensor concatenation)
input_ids = sm.prepend_history(input_ids)
logger.info(f'Input ids with history:\n{input_ids}')
# Generate
input_ids = input_ids.cuda(local_rank)
# returned tensor including input and generated output
output = model.generate(input_ids,
gen_config,
streamer=streamer,
stopping_criteria=adapter.stopping_criteria)
logger.info(f'Output:\n{output}')
# Save output into session manager and maybe trim some history
sm.add_to_history(output)
def cli():
import fire
model_path (str): the huggingface model path.
engine_config (PytorchEngineConfig): Config of engine.
gen_config (EngineGenerationConfig): Config of generation.
session_id (int): the identical id of a session.
trust_remote_code (bool): trust remote code.
"""
from lmdeploy.pytorch.engine import Engine
tm_model = Engine.from_pretrained(model_path,
engine_config=engine_config,
trust_remote_code=trust_remote_code)
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()
adapter_name = None
if engine_config.adapters is not None:
adapter_name = next(iter(engine_config.adapters.keys()))
if gen_config is None:
gen_config = EngineGenerationConfig()
nth_round = 1
step = 0
seed = random.getrandbits(64)
model_name = engine_config.model_name
if model_name is None:
model_name = best_match_model(model_path)
assert model_name is not None, 'Cannot find a matching model template'
print(f'match template: <{model_name}>')
model = MODELS.get(model_name)()
stop_words = _stop_words(model.stop_words, tokenizer)
while True:
prompt = input_prompt(model_name)
if prompt == 'exit':
exit(0)
elif prompt == 'end':
generator.end(session_id)
nth_round = 1
step = 0
seed = random.getrandbits(64)
else:
prompt = model.get_prompt(prompt, nth_round == 1)
input_ids = tokenizer.encode(prompt, nth_round == 1)
session_len = model.session_len
if session_len is None:
session_len = tm_model.session_len
if step >= session_len:
print('WARNING: exceed session max length.'
' Please end the session.')
continue
print(f'{prompt} ', end='', flush=True)
state = DetokenizeState()
gen_config.random_seed = seed
gen_config.stop_words = stop_words
for outputs in generator.stream_infer(session_id=session_id,
input_ids=input_ids,
gen_config=gen_config,
adapter_name=adapter_name):
status, res, tokens = outputs
# decode res
response, state = tokenizer.detokenize_incrementally(
res, state)
response = valid_str(response)
print(f'{response}', end='', flush=True)
# update step
step += len(input_ids) + tokens
print()
nth_round += 1
def main(model_path: str,
model_name: str = None,
session_id: int = 1,
top_k: float = 40,
top_p: float = 0.8,
temperature: float = 0.8,
repetition_penalty: float = 1.0,
tp: int = 1,
stream_output: bool = True,
adapter: str = None,
trust_remote_code: bool = True):
"""An example to perform model inference through the command line
interface.
fire.Fire(main)
Args:
model_path (str): the huggingface model path
model_name (str): name of the model.
session_id (int): the identical id of a session
top_k (int): sampling top k.
top_p (int): sampling top p.
temperature (float): sampling temperature.
repetition_penalty (float): parameter to penalize repetition
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
adapter (str): path to lora adapter.
trust_remote_code (bool): Trust remote code.
"""
adapters = None
if adapter is not None:
adapters = dict(default=adapter)
engine_config = PytorchEngineConfig(model_name=model_name,
tp=tp,
adapters=adapters)
gen_config = EngineGenerationConfig(max_new_tokens=512,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=False)
return run_chat(model_path,
engine_config,
gen_config,
session_id=session_id,
trust_remote_code=trust_remote_code)
if __name__ == '__main__':
cli()
import fire
fire.Fire(main)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import logging
import queue
import warnings
from typing import List, Optional
import pynvml
import torch
import torch.multiprocessing as mp
from torch.nn.utils.rnn import pad_sequence
from transformers import (AutoTokenizer, PreTrainedModel,
PreTrainedTokenizerBase)
from .model import accel_model, init_model
def safe_numel(free_mem, model_size, max_intermediate):
"""Number of elements without out-of-memory."""
return int(free_mem - model_size) // max_intermediate
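# Illustrative worked example, not part of this commit: with 40e9 bytes free,
# a 14e9-byte model and ~2e6 bytes of intermediate memory per token, roughly
# 13k tokens can be decoded at once. The helper name is made up.
def _demo_safe_numel():
    assert safe_numel(40e9, 14e9, 2e6) == 13000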
def avail_gpus(percentage=0.96):
"""Detect available gpus.
Args:
percentage (float): The minimum percentage of free memory to be
considered as available.
Return:
A list of gpu ids.
average free memory on single gpu.
"""
gpus = []
mems = []
pynvml.nvmlInit()
for i in range(torch.cuda.device_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(int(i))
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
free, total = int(mem_info.free), int(mem_info.total)
if free / total > percentage:
gpus.append(i)
mems.append(free)
pynvml.nvmlShutdown()
if len(gpus) == 0:
raise RuntimeError('No GPU available.')
return gpus, sum(mems) / len(mems)
@torch.no_grad()
def decode_single(model: PreTrainedModel,
input_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
return_logits=True):
"""Decode a single batch.
Args:
model (PreTrainedModel): Pretrained model.
input_ids (torch.Tensor): A batch of input ids.
attention_mask (torch.Tensor): A batch of attention masks.
Returns:
torch.Tensor: A batch of probabilities (on CPU).
Note:
This function assumes input_ids[i] = [bos, x1, x2, ..., xn]
and return prob = [p(x1|bos), p(x2|bos,x1), ..., p(xn|bos..xn-1)]
So prob is shorter than input_ids by 1.
"""
# Call Causal LM forward
outputs = model(input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=False,
output_attentions=False,
use_cache=False,
return_dict=True)
# fp32, [bs, seq_len, vocab_size]
logits = outputs.logits
if not return_logits:
# inplace softmax to get probs
torch.softmax(logits, dim=-1, out=logits)
# Shift to fetch probabilities
shift_labels = input_ids[..., 1:].contiguous()
shift_probs = logits[..., :-1, :].contiguous()
logits = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
if attention_mask is not None:
logits *= attention_mask[..., None]
logits = logits.cpu()
return logits
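# Illustrative sketch, not part of this commit: scoring one short sequence with
# decode_single. The model path and helper name are examples only.
def _demo_decode_single(model_path='path/to/hf/model'):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).eval().cuda()
    input_ids = tokenizer('hello world', return_tensors='pt').input_ids.cuda()
    probs = decode_single(model, input_ids, return_logits=False)
    # probs[0, t, 0] = p(x_{t+1} | x_0..x_t); one step shorter than input_ids
    return probs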
def worker_fn(model_path: str,
inq: mp.Queue,
outq: mp.Queue,
accel: Optional[str] = None,
gpu_id=0):
# torch.set_default_device(gpu_id)
model, _ = init_model(model_path)
model = model.eval()
model = accel_model(model, accel, gpu_id=gpu_id)
while True:
try:
idx, args = inq.get(timeout=1)
except queue.Empty:
continue
if idx is None:
print(f'Worker {gpu_id} received exit signal.')
break
# print(args)
input_ids, input_lens, *args = args
input_ids = input_ids.cuda(gpu_id)
max_len = max(input_lens)
assert max_len == input_ids.size(-1), \
f'input_ids.shape = {input_ids.shape}, max_len = {max_len}'
input_lens = torch.tensor(input_lens, device=gpu_id)
attention_mask = \
torch.arange(max_len, device=gpu_id)[None, :] < input_lens[:, None]
assert attention_mask.shape == input_ids.shape, \
f'attention_mask.shape = {attention_mask.shape}'
try:
probs = decode_single(model, input_ids, attention_mask, *args)
except torch.cuda.OutOfMemoryError:
warnings.warn(
f'OOM on GPU {gpu_id}, discard prompts at indices {idx}.')
probs = torch.empty((input_ids.size(0), 0),
dtype=torch.float32,
device='cpu')
outq.put((idx, probs))
print(f'Exiting worker {gpu_id} ...')
inq.close()
outq.close()
print(f'Worker {gpu_id} finished.')
class Engine:
"""Multi-GPU deciding engine.
Args:
model_path (str): Path to the pretrained model.
tokenizer_path (str, optional): Path to the pretrained tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
tokenizer (PreTrainedTokenizerBase, optional): Pre-configured tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
accel (str, optional): Acceleration method.
Defaults to None. 'deepspeed' is not tested.
gpu_mem_percentage (float, optional): GPUs whose free-memory ratio is larger
than this value are considered available and used as decode devices.
Defaults to 0.96.
model_size_byte (float, optional): (Approximate) model size in bytes.
Defaults to 14e9 (7B model in FP16).
bytes_per_token (float, optional): (Approximate) memory cost per token in bytes.
Defaults to 2e6 (2MB).
``bytes_per_token`` and ``model_size_byte`` are used to compute
the maximum batch size for given seq_length
""" # noqa: E501
def __init__(self,
model_path: str,
tokenizer_path: Optional[str] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
accel: Optional[str] = None,
gpu_mem_percentage: float = 0.96,
model_size_byte=14e9,
bytes_per_token=2e6):
gpu_ids, mem = avail_gpus(gpu_mem_percentage)
print(f'Available GPUs are: {gpu_ids}, ', end='')
print(f'with {mem/2**30:.2f} GiB free.')
ctx = mp.get_context('spawn')
inq = ctx.Queue()
outq = ctx.Queue()
ps = []
for id in gpu_ids:
p = ctx.Process(target=worker_fn,
args=(model_path, inq, outq, accel, id))
p.start()
ps.append(p)
if tokenizer is None:
if tokenizer_path is None:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
self.gpu_ids = gpu_ids
self.inq = inq
self.outq = outq
self.ps = ps
self.tokenizer = tokenizer
self.safe_numel = safe_numel(mem, model_size_byte, bytes_per_token)
def clear_queue(self):
for q in self.inq, self.outq:
while not q.empty():
q.get()
def decode(self,
token_ids: List[List[int]],
sort=True,
max_bs: int = 1024,
pad=True,
pad_token_id=2,
return_logits=True):
"""Inference the model to compute probabilities.
Args:
token_ids (List[List[int]]): List of list of token ids.
sort (bool, optional): Internally sort the prompts by length to achieve better efficiency.
Defaults to True.
Note: the order of returned probabilities is always the same as the input.
max_bs (int, optional): Maximum batch size.
Defaults to 1024.
            pad (bool, optional): Pad the prompts in every mini-batch to the
                same length. Defaults to True. Set to False to save memory.
            pad_token_id (int, optional): Token id used for padding.
                Defaults to 2.
            return_logits (bool, optional): Return logits instead of
                probabilities. Defaults to True.

        Returns:
            numpy.ndarray: Array of logits of shape [bsz, seqlen, vocab_size],
                padded with zeros, if pad is True.
            List[numpy.ndarray]: List of un-padded logits, if pad is False.

        Note:
            This function accepts token_ids = [x0(=bos), x1, x2, ..., xn]
            and computes prob = [p(x1|x0), p(x2|x0,x1), ..., p(xn|x0..xn-1)],
            so prob is one element shorter than input_ids.
""" # noqa: E501
self.clear_queue()
# sort to achieve better efficiency
if sort:
            pids_and_indices = sorted(enumerate(token_ids),
                                      key=lambda i_and_x: len(i_and_x[1]))
        else:
            pids_and_indices = list(enumerate(token_ids))
left = 0
bs = max_bs
while left < len(token_ids):
if not sort:
bs = max_bs
right = min(left + bs, len(token_ids))
# batch of prompts
            sub_p_and_i = pids_and_indices[left:right]
idx, sub_p = zip(*sub_p_and_i)
# batch of input_ids and attn_masks
# inputs = self.tokenizer(sub_p, return_tensors='pt', padding=True)
input_ids = [torch.tensor(p) for p in sub_p]
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=pad_token_id)
input_lens = [len(p) for p in sub_p]
# Dynamic batch size based on safe memory
            while input_ids[:bs].numel() > self.safe_numel:
if bs == 1:
break
bs = max(1, round(bs / 1.5))
print(f'\nReduce bs to {bs} when seq len reaches '
f'{input_ids.shape[-1]}')
idx = idx[:bs]
input_lens = input_lens[:bs]
input_ids = input_ids[:bs, :max(input_lens)]
# Send to worker
self.inq.put((idx, (input_ids, input_lens)))
left += bs
print(
                f'Distributing prompts {left}/{len(token_ids)},'
                f' {left/len(token_ids):.0%}',
end='\r')
print()
# Collect outputs from workers
all_probs = [None] * len(token_ids)
count = 0
while count < len(token_ids):
idx, probs = self.outq.get()
for i, p in zip(idx, probs):
assert all_probs[i] is None
all_probs[i] = p
count += len(idx)
print(
f'Decoding and collecting outputs '
f'{count}/{len(token_ids)}, '
f'{count/len(token_ids):.0%}',
end='\r')
print()
if pad:
all_probs = pad_sequence(all_probs, batch_first=True)
all_probs = all_probs.cpu().numpy()
else:
all_probs = [p.cpu().numpy() for p in all_probs]
return all_probs
def __del__(self):
print('Exiting engine ...')
for _ in self.ps:
self.inq.put((None, None))
for p in self.ps:
p.join(timeout=1)
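

# Example programmatic usage (paths and token ids are illustrative):
#   engine = Engine('llama2/huggingface/llama-2-7b')
#   logits = engine.decode([[1, 306, 4658, 278, 6593]])
#   print(logits.shape)   # (1, 4, vocab_size)
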
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_path',
default='llama2/huggingface/llama-2-7b',
                        help='Path to HuggingFace model and tokenizer.')
parser.add_argument(
'--test_path',
default='',
help='Path to text file, with each line containing a prompt.')
parser.add_argument(
'-p',
'--prompts',
nargs='*',
default=[
'I believe the meaning of life is to find your gift.',
'Simply put, the theory of relativity states that',
'Building a website can be done in 10 simple steps:'
],
help="Prompt in command line, please quote \"\" every sentences, "
'surpassed by --test_path')
    parser.add_argument('--min_len',
                        type=int,
                        default=1,
                        help='Minimum length of prompts in tokens.')
parser.add_argument('--save-to',
default='decode.out',
help='Save results to this file.')
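
    # Example invocation (script name and paths are illustrative):
    #   python decode.py --model_path llama2/huggingface/llama-2-7b \
    #       --test_path prompts.txt --save-to decode.out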
args = parser.parse_args()
model_path = args.model_path
test_path = args.test_path
prompts = args.prompts
logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.DEBUG)
# Use test file preferentially
if test_path:
with open(test_path, 'r') as f:
prompts = f.readlines()
prompts = [p.strip() for p in prompts]
# Output infos
print(f'Model path: {model_path}')
def _format(ts, start, end):
if start < 0:
start += len(ts)
if end <= 0:
end += len(ts)
return '\n'.join(
(f'{i}\t{t}' for i, t in zip(range(start, end), ts[start:end])))
if len(prompts) > 10:
print('Prompts:\n' + _format(prompts, 0, 5) + '\n......\n' +
_format(prompts, -5, 0))
else:
print('Prompts:\n' + _format(prompts, 0, 0))
# Init Engine in backend
engine = Engine(model_path)
# Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
input_ids = tokenizer(prompts, padding=False)
input_ids: List[List[int]] = input_ids.input_ids
# Filter out too short prompts
input_ids = [i for i in input_ids if len(i) >= args.min_len]
if len(input_ids) < len(prompts):
logger.warning(
f'Filtered out {len(prompts) - len(input_ids)} prompts, '
f'because they are shorter than {args.min_len}.')
# Decode
logits = engine.decode(input_ids)
print(f'logits.shape = {logits.shape}')
# Save to pth
print(f'Dumping results to = {args.save_to}')
torch.save(logits, args.save_to, pickle_protocol=4)
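    # The array can be reloaded later with torch.load(args.save_to); newer
    # torch versions may require weights_only=False, since it is a pickled
    # numpy array rather than a tensor state dict.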
del engine