Commit d7117b95 authored by zhouxiang

Sync the 0.2.6 code

parent 5f83e392
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import List, Literal, Optional, Union

from .archs import autoget_backend_config, get_task
from .messages import PytorchEngineConfig, TurbomindEngineConfig
from .model import ChatTemplateConfig


def pipeline(model_path: str,
             model_name: Optional[str] = None,
             backend_config: Optional[Union[TurbomindEngineConfig,
                                            PytorchEngineConfig]] = None,
             chat_template_config: Optional[ChatTemplateConfig] = None,
             log_level='ERROR',
             **kwargs):
    """
@@ -21,38 +26,83 @@ def pipeline(model_path: str,
                "InternLM/internlm-chat-20b-4bit",
                "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
                on huggingface.co, such as "internlm/internlm-chat-7b",
                "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
                and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "internlm/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
            config instance. Default to None.
        chat_template_config (ChatTemplateConfig): chat template configuration.
            Default to None.
        log_level (str): set log level whose value among
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]

    Examples:
        >>> # LLM
        >>> import lmdeploy
        >>> pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
        >>> response = pipe(['hi', 'say this is a test'])
        >>> print(response)
        >>>
        >>> # VLM
        >>> from lmdeploy.vl import load_image
        >>> from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
        >>> pipe = pipeline('liuhaotian/llava-v1.5-7b',
        ...                 backend_config=TurbomindEngineConfig(session_len=8192),
        ...                 chat_template_config=ChatTemplateConfig(model_name='vicuna'))
        >>> im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
        >>> response = pipe([('describe this image', [im])])
        >>> print(response)
    """  # noqa E501
    if os.getenv('TM_LOG_LEVEL') is None:
        os.environ['TM_LOG_LEVEL'] = log_level
    from lmdeploy.utils import get_logger
    logger = get_logger('lmdeploy')
    logger.setLevel(log_level)

    pipeline_type, pipeline_class = get_task(model_path)
    if pipeline_type == 'vlm':
        assert (type(backend_config) is TurbomindEngineConfig) or \
            (backend_config is None), \
            f'{pipeline_type} model only support turbomind backend.'
    if pipeline_type == 'llm' and type(
            backend_config) is not PytorchEngineConfig:
        # set auto backend mode
        backend_config = autoget_backend_config(model_path, backend_config)
    backend = 'pytorch' if type(
        backend_config) is PytorchEngineConfig else 'turbomind'
    logger.info(f'Using {backend} engine')
    if 'tp' in kwargs:
        logger.warning(
            'The argument "tp" is deprecated and will be removed soon. '
            'Please set "tp" in "backend_config"')
        tp = kwargs['tp']
        kwargs.pop('tp')
    else:
        tp = 1 if backend_config is None else backend_config.tp

    return pipeline_class(model_path,
                          model_name=model_name,
                          backend=backend,
                          backend_config=backend_config,
                          chat_template_config=chat_template_config,
                          tp=tp,
                          **kwargs)


def serve(model_path: str,
          model_name: Optional[str] = None,
          backend: Literal['turbomind', 'pytorch'] = 'turbomind',
          backend_config: Optional[Union[TurbomindEngineConfig,
                                         PytorchEngineConfig]] = None,
          chat_template_config: Optional[ChatTemplateConfig] = None,
          server_name: str = '0.0.0.0',
          server_port: int = 23333,
          log_level: str = 'ERROR',
          api_keys: Optional[Union[List[str], str]] = None,
          ssl: bool = False,
          **kwargs):
    """This will run the api_server in a subprocess.
@@ -67,24 +117,31 @@ def serve(model_path: str,
                "InternLM/internlm-chat-20b-4bit",
                "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
                on huggingface.co, such as "internlm/internlm-chat-7b",
                "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
                and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "internlm/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        backend (str): either `turbomind` or `pytorch` backend. Default to
            `turbomind` backend.
        backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
            config instance. Default to None.
        chat_template_config (ChatTemplateConfig): chat template configuration.
            Default to None.
        server_name (str): host ip for serving
        server_port (int): server port
        log_level (str): set log level whose value among
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]
        api_keys (List[str] | str | None): Optional list of API keys. Accepts
            string type as a single api_key. Default to None, which means no
            api key applied.
        ssl (bool): Enable SSL. Requires OS Environment variables
            'SSL_KEYFILE' and 'SSL_CERTFILE'.

    Return:
        APIClient: A client chatbot for LLaMA series models.

    Examples:
        >>> import lmdeploy
        >>> client = lmdeploy.serve('internlm/internlm-chat-7b', 'internlm-chat-7b')
        >>> for output in client.chat('hi', 1):
        ...     print(output)
    """  # noqa E501
@@ -93,33 +150,57 @@ def serve(model_path: str,
    from lmdeploy.serve.openai.api_client import APIClient
    from lmdeploy.serve.openai.api_server import serve

    if type(backend_config) is not PytorchEngineConfig:
        # set auto backend mode
        backend_config = autoget_backend_config(model_path, backend_config)
    backend = 'pytorch' if type(
        backend_config) is PytorchEngineConfig else 'turbomind'
    if 'tp' in kwargs:
        tp = kwargs['tp']
        kwargs.pop('tp')
    else:
        tp = 1 if backend_config is None else backend_config.tp

    task = Process(target=serve,
                   args=(model_path, ),
                   kwargs=dict(model_name=model_name,
                               backend=backend,
                               backend_config=backend_config,
                               chat_template_config=chat_template_config,
                               server_name=server_name,
                               server_port=server_port,
                               tp=tp,
                               log_level=log_level,
                               api_keys=api_keys,
                               ssl=ssl,
                               **kwargs),
                   daemon=True)
    task.start()
    client = APIClient(f'http://{server_name}:{server_port}')
    while True:
        time.sleep(1)
        try:
            client.available_models
            print(
                f'Launched the api_server in process {task.pid}, user can '
                f'kill the server by:\nimport os,signal\nos.kill({task.pid}, '
                'signal.SIGKILL)')
            return client
        except:  # noqa
            pass


def client(api_server_url: str = 'http://0.0.0.0:23333',
           api_key: Optional[str] = None,
           **kwargs):
    """
    Args:
        api_server_url (str): communicating address 'http://<ip>:<port>' of
            api_server
        api_key (str | None): api key. Default to None, which means no
            api key will be used.
    Return:
        Chatbot for LLaMA series models with turbomind as inference engine.
    """
    from lmdeploy.serve.openai.api_client import APIClient
    return APIClient(api_server_url, api_key, **kwargs)
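
# Usage sketch for the three helpers above (illustrative only, not part of the
# diff; it assumes 'internlm/internlm-chat-7b' is reachable on huggingface.co
# and a CUDA device is available; any other supported model id works the same):
if __name__ == '__main__':
    import lmdeploy

    # offline batched inference through the high-level pipeline
    pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
    print(pipe(['hi', 'say this is a test']))

    # launch the OpenAI-compatible api_server in a subprocess and query it
    api_client = lmdeploy.serve('internlm/internlm-chat-7b',
                                server_name='0.0.0.0',
                                server_port=23333)
    for output in api_client.chat('hi', 1):
        print(output)

    # or attach to an api_server that is already running elsewhere
    api_client = lmdeploy.client('http://0.0.0.0:23333')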

# Copyright (c) OpenMMLab. All rights reserved.
from .entrypoint import run

__all__ = ['run']

# Copyright (c) OpenMMLab. All rights reserved.
from .cli import CLI
from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter,
                    convert_args, get_lora_adapters)


class SubCliChat(object):
    _help = 'Chat with pytorch or turbomind engine.'
    _desc = _help
    parser = CLI.subparsers.add_parser('chat', help=_help, description=_desc)
    subparsers = parser.add_subparsers(
        title='Commands', description='This group has the following commands:')

    @staticmethod
    def add_parser_torch():
        """Add parser for torch command."""
        parser = SubCliChat.subparsers.add_parser(
            'torch',
            formatter_class=DefaultsAndTypesHelpFormatter,
            help=SubCliChat.torch.__doc__,
            description=SubCliChat.torch.__doc__,
        )
        parser.set_defaults(run=SubCliChat.torch)
        parser.add_argument('model_path',
                            type=str,
                            help='The huggingface model path')
        # engine args
        engine_group = parser.add_argument_group('Engine arguments')
        ArgumentHelper.model_name(engine_group)
        ArgumentHelper.tp(engine_group)
        ArgumentHelper.session_len(engine_group)
        ArgumentHelper.adapters(engine_group)
        ArgumentHelper.cache_max_entry_count(engine_group)
        # other args
        parser.add_argument('--trust-remote-code',
                            action='store_false',
                            default=True,
                            help='Trust remote code')

    @staticmethod
    def add_parser_turbomind():
        """Add parser for turbomind command."""
        parser = SubCliChat.subparsers.add_parser(
            'turbomind',
            formatter_class=DefaultsAndTypesHelpFormatter,
            help=SubCliChat.turbomind.__doc__,
            description=SubCliChat.turbomind.__doc__,
        )
        parser.set_defaults(run=SubCliChat.turbomind)
        parser.add_argument(
            'model_path',
            type=str,
            help='The path of the deployed model. '
            'It can be in format of huggingface or turbomind. '
            'When it is turbomind model, all arguments for engine'
            'config would be ignored, so you need to change the `config.ini`')
        # engine arguments
        engine_group = parser.add_argument_group('Engine arguments')
        ArgumentHelper.tp(engine_group)
        ArgumentHelper.model_format(engine_group)
        ArgumentHelper.quant_policy(engine_group)
        ArgumentHelper.model_name(engine_group)
        ArgumentHelper.cache_max_entry_count(engine_group)
        ArgumentHelper.rope_scaling_factor(engine_group)
        ArgumentHelper.session_len(engine_group)
        # other arguments
        ArgumentHelper.cap(parser)
        ArgumentHelper.meta_instruction(parser)  # TODO remove
        ArgumentHelper.chat_template(parser)

    @staticmethod
    def torch(args):
        """Chat with PyTorch inference engine through terminal."""
        from lmdeploy.messages import PytorchEngineConfig
        from lmdeploy.pytorch.chat import run_chat

        adapters = get_lora_adapters(args.adapters)
        engine_config = PytorchEngineConfig(
            model_name=args.model_name,
            tp=args.tp,
            session_len=args.session_len,
            cache_max_entry_count=args.cache_max_entry_count,
            adapters=adapters)
        run_chat(args.model_path,
                 engine_config,
                 trust_remote_code=args.trust_remote_code)

    @staticmethod
    def turbomind(args):
        """Chat with TurboMind inference engine through terminal."""
        from lmdeploy.turbomind.chat import main
        kwargs = convert_args(args)
        from lmdeploy.model import ChatTemplateConfig
        chat_template_config = ChatTemplateConfig(
            model_name=args.model_name,
            meta_instruction=args.meta_instruction,
            capability=args.cap)
        if args.chat_template:
            chat_template_config = ChatTemplateConfig.from_json(
                args.chat_template)
        kwargs.update(dict(chat_template_cfg=chat_template_config))
        kwargs.pop('chat_template', None)
        main(**kwargs)

    @staticmethod
    def add_parsers():
        """Add all parsers."""
        SubCliChat.add_parser_torch()
        SubCliChat.add_parser_turbomind()

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os

from ..version import __version__
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args


class CLI(object):
    _desc = 'The CLI provides a unified API for converting, ' \
            'compressing and deploying large language models.'
    parser = argparse.ArgumentParser(prog='lmdeploy',
                                     description=_desc,
                                     add_help=True)
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=__version__)
    subparsers = parser.add_subparsers(
        title='Commands',
        description='lmdeploy has following commands:',
        dest='command')

    @staticmethod
    def add_parser_convert():
        """Add parser for convert command."""
        parser = CLI.subparsers.add_parser(
            'convert',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=CLI.convert.__doc__,
            help=CLI.convert.__doc__)
        # define arguments
        parser.add_argument(
            'model_name',
            type=str,
            help='The name of the to-be-deployed model, such as llama-7b, '
            'llama-13b, vicuna-7b and etc. You can run `lmdeploy list` to '
            'get the supported model names')
        parser.add_argument('model_path',
                            type=str,
                            help='The directory path of the model')
        ArgumentHelper.model_format(parser)
        ArgumentHelper.tp(parser)
        # other args
        parser.add_argument('--tokenizer-path',
                            type=str,
                            default=None,
                            help='The path of tokenizer model')
        parser.add_argument('--dst-path',
                            type=str,
                            default='workspace',
                            help='The destination path that saves outputs')
        parser.add_argument(
            '--quant-path',
            type=str,
            default=None,
            help='Path of the quantized model, which can be none')
        parser.add_argument(
            '--group-size',
            type=int,
            default=0,
            help='A parameter used in awq to quantize fp16 weights '
            'to 4 bits')

        parser.set_defaults(run=CLI.convert)

    @staticmethod
    def add_parser_list():
        """Add parser for list command."""
        parser = CLI.subparsers.add_parser(
            'list',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=CLI.list.__doc__,
            help=CLI.list.__doc__)
        parser.set_defaults(run=CLI.list)
        # define arguments
        ArgumentHelper.engine(parser)

    @staticmethod
    def add_parser_checkenv():
        """Add parser for check_env command."""
        parser = CLI.subparsers.add_parser(
            'check_env',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=CLI.check_env.__doc__,
            help=CLI.check_env.__doc__)
        parser.set_defaults(run=CLI.check_env)
        parser.add_argument('--dump-file',
                            type=str,
                            default=None,
                            help='The file path to save env info. Only '
                            'support file format in `json`, `yml`,'
                            ' `pkl`')

    @staticmethod
    def convert(args):
        """Convert LLMs to turbomind format."""
        from lmdeploy.turbomind.deploy.converter import main
        kwargs = convert_args(args)
        main(**kwargs)

    @staticmethod
    def list(args):
        """List the supported model names."""
        from lmdeploy.model import MODELS
        model_names = list(MODELS.module_dict.keys())
        deprecate_names = [
            'baichuan-7b', 'baichuan2-7b', 'chatglm2-6b', 'internlm-chat-20b',
            'internlm-chat-7b', 'internlm-chat-7b-8k', 'internlm2-1_8b',
            'internlm-20b', 'internlm2-20b', 'internlm2-7b', 'internlm2-chat',
            'internlm2-chat-1_8b', 'internlm2-chat-20b', 'internlm2-chat-7b',
            'llama-2-chat', 'llama-2', 'qwen-14b', 'qwen-7b', 'solar-70b',
            'yi-200k', 'yi-34b', 'yi-chat', 'Mistral-7B-Instruct',
            'Mixtral-8x7B-Instruct', 'baichuan-base', 'deepseek-chat',
            'internlm-chat'
        ]
        model_names = [
            n for n in model_names if n not in deprecate_names + ['base']
        ]
        deprecate_names.sort()
        model_names.sort()
        print('The older chat template name like "internlm2-7b", "qwen-7b"'
              ' and so on are deprecated and will be removed in the future.'
              ' The supported chat template names are:')
        print('\n'.join(model_names))

    @staticmethod
    def check_env(args):
        """Check the environmental information."""
        import importlib

        import mmengine
@@ -121,19 +158,16 @@ class CLI(object):
            print(f'{k}: {v}')

        # dump to local file
        dump_file = args.dump_file
        if dump_file is not None:
            work_dir, _ = os.path.split(dump_file)
            if work_dir:
                os.makedirs(work_dir, exist_ok=True)
            mmengine.dump(env_info, dump_file)

    @staticmethod
    def add_parsers():
        """Add all parsers."""
        CLI.add_parser_convert()
        CLI.add_parser_list()
        CLI.add_parser_checkenv()
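
# A minimal, self-contained sketch (not part of this commit) of the dispatch
# pattern the CLI above relies on: every subcommand registers its handler via
# `parser.set_defaults(run=...)`, so an entrypoint only needs to parse the
# arguments once and call `args.run(args)`.
import argparse

toy_parser = argparse.ArgumentParser(prog='toy')
toy_subparsers = toy_parser.add_subparsers(title='Commands', dest='command')

hello = toy_subparsers.add_parser('hello', help='Print a greeting')
hello.add_argument('name', type=str)
hello.set_defaults(run=lambda args: print(f'hello, {args.name}'))

toy_args = toy_parser.parse_args(['hello', 'lmdeploy'])
toy_args.run(toy_args)  # prints: hello, lmdeploy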

# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.config import DictAction

from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args


class SubCliLite(object):
    """CLI for compressing LLMs."""
    _help = 'Compressing and accelerating LLMs with lmdeploy.lite module'
    _desc = _help
    parser = CLI.subparsers.add_parser(
        'lite',
        help=_help,
        description=_desc,
    )
    subparsers = parser.add_subparsers(
        title='Commands', description='This group has the following commands:')

    @staticmethod
    def add_parser_auto_awq():
        """Add parser for auto_awq command."""
        parser = SubCliLite.subparsers.add_parser(
            'auto_awq',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.auto_awq.__doc__,
            help=SubCliLite.auto_awq.__doc__)
        parser.set_defaults(run=SubCliLite.auto_awq)
        parser.add_argument('model',
                            type=str,
                            help='The path of model in hf format')
        ArgumentHelper.work_dir(parser)
        ArgumentHelper.calib_dataset(parser)
        ArgumentHelper.calib_samples(parser)
        ArgumentHelper.calib_seqlen(parser)
        ArgumentHelper.device(parser)
        parser.add_argument('--w-bits',
                            type=int,
                            default=4,
                            help='Bit number for weight quantization')
        parser.add_argument('--w-sym',
                            action='store_true',
                            help='Whether to do symmetric quantization')
        parser.add_argument(
            '--w-group-size',
            type=int,
            default=128,
            help='Group size for weight quantization statistics')

    @staticmethod
    def add_parser_calibrate():
        """Add parser for calibrate command."""
        parser = SubCliLite.subparsers.add_parser(
            'calibrate',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.calibrate.__doc__,
            help=SubCliLite.calibrate.__doc__)
        parser.set_defaults(run=SubCliLite.calibrate)
        parser.add_argument('model',
                            type=str,
                            help='The name or path of the model to be loaded')
        ArgumentHelper.work_dir(parser)
        ArgumentHelper.calib_dataset(parser)
        ArgumentHelper.calib_samples(parser)
        ArgumentHelper.calib_seqlen(parser)
        ArgumentHelper.device(parser)

    @staticmethod
    def add_parser_smooth_quant():
        """Add parser for smooth_quant command."""
        parser = SubCliLite.subparsers.add_parser(
            'smooth_quant',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.smooth_quant.__doc__,
            help=SubCliLite.smooth_quant.__doc__)
        parser.set_defaults(run=SubCliLite.smooth_quant)
        parser.add_argument('model',
                            type=str,
                            help='The name or path of the model to be loaded')
        parser.add_argument(
            '--work-dir',
            type=str,
            default='./work_dir',
            help='The working directory for outputs. Defaults to "./work_dir"')
        ArgumentHelper.calib_dataset(parser)
        ArgumentHelper.calib_samples(parser)
        ArgumentHelper.calib_seqlen(parser)
        ArgumentHelper.device(parser)

    @staticmethod
    def add_parser_kv_qparams():
        """Add parser for kv_qparams command."""
        parser = SubCliLite.subparsers.add_parser(
            'kv_qparams',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.kv_qparams.__doc__,
            help=SubCliLite.kv_qparams.__doc__)
        parser.set_defaults(run=SubCliLite.kv_qparams)
        parser.add_argument('work_dir',
                            type=str,
                            help='Directory path where the stats are saved')
        parser.add_argument('turbomind_dir',
                            type=str,
                            help='Directory path where to save the results')
        parser.add_argument('--kv-bits',
                            type=int,
                            default=8,
                            help='Number of bits for quantization')
        parser.add_argument('--kv-sym',
                            action='store_true',
                            help='Whether to use symmetric quantization')
        parser.add_argument(
            '--num-tp',
            type=int,
            default=None,
            help='GPU number used in tensor parallelism. Should be 2^n')
        parser.add_argument('--tm-params',
                            nargs='*',
                            default=None,
                            action=DictAction,
                            help='Used key-values pairs in xxx=yyy format'
                            ' to update the turbomind model weights'
                            ' config')

    @staticmethod
    def auto_awq(args):
        """Perform weight quantization using AWQ algorithm."""
        from lmdeploy.lite.apis.auto_awq import auto_awq
        kwargs = convert_args(args)
        auto_awq(**kwargs)

    @staticmethod
    def calibrate(args):
        """Perform calibration on a given dataset."""
        from lmdeploy.lite.apis.calibrate import calibrate
        kwargs = convert_args(args)
        calibrate(**kwargs)

    @staticmethod
    def kv_qparams(args):
        """Export key and value stats."""
        from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
        kwargs = convert_args(args)
        run_kv_qparams(**kwargs)

    @staticmethod
    def smooth_quant(args):
        """Perform w8a8 quantization using SmoothQuant."""
        from lmdeploy.lite.apis.smooth_quant import smooth_quant
        kwargs = convert_args(args)
        smooth_quant(**kwargs)

    @staticmethod
    def add_parsers():
        """Add all parsers."""
        SubCliLite.add_parser_auto_awq()
        SubCliLite.add_parser_calibrate()
        SubCliLite.add_parser_kv_qparams()
        SubCliLite.add_parser_smooth_quant()
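
# Hedged command-line sketch for the subcommands registered above. The
# explicit flags (--w-bits, --w-sym, --w-group-size, --kv-bits, --kv-sym,
# --num-tp, --work-dir) come from this file; the calibration flags are assumed
# to be the hyphenated forms added by ArgumentHelper, and the paths are
# placeholders:
#
#   lmdeploy lite calibrate $HF_MODEL --work-dir ./work_dir \
#       --calib-dataset ptb --calib-samples 128 --calib-seqlen 2048
#   lmdeploy lite auto_awq $HF_MODEL --work-dir ./work_dir \
#       --w-bits 4 --w-group-size 128
#   lmdeploy lite kv_qparams ./work_dir $TURBOMIND_DIR --kv-bits 8 --kv-sym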

# Copyright (c) OpenMMLab. All rights reserved.
from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args


class SubCliServe:
    """Serve LLMs and interact on terminal or web UI."""
    _help = 'Serve LLMs with gradio, openai API or triton server.'
    _desc = _help
    parser = CLI.subparsers.add_parser(
        'serve',
        help=_help,
        description=_desc,
    )
    subparsers = parser.add_subparsers(
        title='Commands', description='This group has the following commands:')

    @staticmethod
    def add_parser_gradio():
        """Add parser for gradio command."""
        parser = SubCliServe.subparsers.add_parser(
            'gradio',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.gradio.__doc__,
            help=SubCliServe.gradio.__doc__)
        parser.set_defaults(run=SubCliServe.gradio)
        parser.add_argument(
            'model_path_or_server',
            type=str,
            help='The path of the deployed model or the tritonserver url or '
            'restful api url. for example: - ./workspace - 0.0.0.0:23333'
            ' - http://0.0.0.0:23333')
        parser.add_argument('--server-name',
                            type=str,
                            default='0.0.0.0',
                            help='The ip address of gradio server')
        parser.add_argument('--server-port',
                            type=int,
                            default=6006,
                            help='The port of gradio server')

        # common args
        ArgumentHelper.backend(parser)

        # chat template args
        ArgumentHelper.meta_instruction(parser)  # TODO remove
        ArgumentHelper.chat_template(parser)
        ArgumentHelper.cap(parser)

        # pytorch engine args
        pt_group = parser.add_argument_group('PyTorch engine arguments')
        # common engine args
        tp_act = ArgumentHelper.tp(pt_group)
        model_name_act = ArgumentHelper.model_name(pt_group)
        session_len_act = ArgumentHelper.session_len(pt_group)
        max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
        cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)

        # turbomind args
        tb_group = parser.add_argument_group('TurboMind engine arguments')
        # common engine args
        tb_group._group_actions.append(tp_act)
        tb_group._group_actions.append(model_name_act)
        tb_group._group_actions.append(session_len_act)
        tb_group._group_actions.append(max_batch_size_act)
        tb_group._group_actions.append(cache_max_entry_act)
        ArgumentHelper.model_format(tb_group)
        ArgumentHelper.quant_policy(tb_group)
        ArgumentHelper.rope_scaling_factor(tb_group)

    @staticmethod
    def add_parser_api_server():
        """Add parser for api_server command."""
        parser = SubCliServe.subparsers.add_parser(
            'api_server',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.api_server.__doc__,
            help=SubCliServe.api_server.__doc__)
        parser.set_defaults(run=SubCliServe.api_server)
        parser.add_argument(
            'model_path',
            type=str,
            help='The path of a model. it could be one of the following '
            'options: - i) a local directory path of a turbomind model'
            ' which is converted by `lmdeploy convert` command or '
            'download from ii) and iii). - ii) the model_id of a '
            'lmdeploy-quantized model hosted inside a model repo on '
            'huggingface.co, such as "internlm/internlm-chat-20b-4bit",'
            ' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
            ' of a model hosted inside a model repo on huggingface.co,'
            ' such as "internlm/internlm-chat-7b", "qwen/qwen-7b-chat "'
            ', "baichuan-inc/baichuan2-7b-chat" and so on')
        parser.add_argument('--server-name',
                            type=str,
                            default='0.0.0.0',
                            help='Host ip for serving')
        parser.add_argument('--server-port',
                            type=int,
                            default=23333,
                            help='Server port')
        parser.add_argument('--allow-origins',
                            nargs='+',
                            type=str,
                            default=['*'],
                            help='A list of allowed origins for cors')
        parser.add_argument('--allow-credentials',
                            action='store_true',
                            help='Whether to allow credentials for cors')
        parser.add_argument('--allow-methods',
                            nargs='+',
                            type=str,
                            default=['*'],
                            help='A list of allowed http methods for cors')
        parser.add_argument('--allow-headers',
                            nargs='+',
                            type=str,
                            default=['*'],
                            help='A list of allowed http headers for cors')
        parser.add_argument('--qos-config-path',
                            type=str,
                            default='',
                            help='Qos policy config path')
        # common args
        ArgumentHelper.backend(parser)
        ArgumentHelper.log_level(parser)
        ArgumentHelper.api_keys(parser)
        ArgumentHelper.ssl(parser)

        # chat template args
        ArgumentHelper.meta_instruction(parser)  # TODO remove
        ArgumentHelper.chat_template(parser)
        ArgumentHelper.cap(parser)

        # pytorch engine args
        pt_group = parser.add_argument_group('PyTorch engine arguments')
        # common engine args
        tp_act = ArgumentHelper.tp(pt_group)
        model_name_act = ArgumentHelper.model_name(pt_group)
        session_len_act = ArgumentHelper.session_len(pt_group)
        max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
        cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)

        # turbomind args
        tb_group = parser.add_argument_group('TurboMind engine arguments')
        # common engine args
        tb_group._group_actions.append(tp_act)
        tb_group._group_actions.append(model_name_act)
        tb_group._group_actions.append(session_len_act)
        tb_group._group_actions.append(max_batch_size_act)
        tb_group._group_actions.append(cache_max_entry_act)
        ArgumentHelper.model_format(tb_group)
        ArgumentHelper.quant_policy(tb_group)
        ArgumentHelper.rope_scaling_factor(tb_group)

    @staticmethod
    def add_parser_api_client():
        """Add parser for api_client command."""
        parser = SubCliServe.subparsers.add_parser(
            'api_client',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.api_client.__doc__,
            help=SubCliServe.api_client.__doc__)
        parser.set_defaults(run=SubCliServe.api_client)
        parser.add_argument('api_server_url',
                            type=str,
                            help='The URL of api server')
        parser.add_argument('--api-key',
                            type=str,
                            default=None,
                            help='api key. Default to None, which means no '
                            'api key will be used')
        ArgumentHelper.session_id(parser)

    @staticmethod
    def add_parser_triton_client():
        """Add parser for triton_client command."""
        parser = SubCliServe.subparsers.add_parser(
            'triton_client',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.triton_client.__doc__,
            help=SubCliServe.triton_client.__doc__)
        parser.set_defaults(run=SubCliServe.triton_client)
        parser.add_argument(
            'tritonserver_addr',
            type=str,
            help='The address in format "ip:port" of triton inference server')
        ArgumentHelper.session_id(parser)
        ArgumentHelper.cap(parser)
        ArgumentHelper.stream_output(parser)

    @staticmethod
    def gradio(args):
        """Serve LLMs with web UI using gradio."""
        from lmdeploy.archs import autoget_backend
        from lmdeploy.messages import (PytorchEngineConfig,
                                       TurbomindEngineConfig)
        from lmdeploy.model import ChatTemplateConfig
        from lmdeploy.serve.gradio.app import run
        backend = args.backend

        if backend != 'pytorch' and ':' not in args.model_path_or_server:
            # set auto backend mode
            backend = autoget_backend(args.model_path_or_server)
        if backend == 'pytorch':
            backend_config = PytorchEngineConfig(
                tp=args.tp,
                model_name=args.model_name,
                max_batch_size=args.max_batch_size,
                cache_max_entry_count=args.cache_max_entry_count,
                session_len=args.session_len)
        else:
            backend_config = TurbomindEngineConfig(
                model_name=args.model_name,
                tp=args.tp,
                max_batch_size=args.max_batch_size,
                session_len=args.session_len,
                model_format=args.model_format,
                quant_policy=args.quant_policy,
                rope_scaling_factor=args.rope_scaling_factor,
                cache_max_entry_count=args.cache_max_entry_count)
        chat_template_config = ChatTemplateConfig(
            model_name=args.model_name,
            meta_instruction=args.meta_instruction,
            capability=args.cap)
        if args.chat_template:
            chat_template_config = ChatTemplateConfig.from_json(
                args.chat_template)
        run(args.model_path_or_server,
            server_name=args.server_name,
            server_port=args.server_port,
            backend=backend,
            backend_config=backend_config,
            chat_template_config=chat_template_config)

    @staticmethod
    def api_server(args):
        """Serve LLMs with restful api using fastapi."""
        from lmdeploy.archs import autoget_backend
        from lmdeploy.model import ChatTemplateConfig
        from lmdeploy.serve.openai.api_server import serve as run_api_server
        backend = args.backend
        if backend != 'pytorch':
            # set auto backend mode
            backend = autoget_backend(args.model_path)
        if backend == 'pytorch':
            from lmdeploy.messages import PytorchEngineConfig
            backend_config = PytorchEngineConfig(
                tp=args.tp,
                model_name=args.model_name,
                max_batch_size=args.max_batch_size,
                cache_max_entry_count=args.cache_max_entry_count,
                session_len=args.session_len)
        else:
            from lmdeploy.messages import TurbomindEngineConfig
            backend_config = TurbomindEngineConfig(
                model_name=args.model_name,
                tp=args.tp,
                max_batch_size=args.max_batch_size,
                session_len=args.session_len,
                model_format=args.model_format,
                quant_policy=args.quant_policy,
                rope_scaling_factor=args.rope_scaling_factor,
                cache_max_entry_count=args.cache_max_entry_count)
        chat_template_config = ChatTemplateConfig(
            model_name=args.model_name,
            meta_instruction=args.meta_instruction,
            capability=args.cap)
        if args.chat_template:
            chat_template_config = ChatTemplateConfig.from_json(
                args.chat_template)
        run_api_server(args.model_path,
                       backend=backend,
                       backend_config=backend_config,
                       chat_template_config=chat_template_config,
                       server_name=args.server_name,
                       server_port=args.server_port,
                       allow_origins=args.allow_origins,
                       allow_credentials=args.allow_credentials,
                       allow_methods=args.allow_methods,
                       allow_headers=args.allow_headers,
                       log_level=args.log_level.upper(),
                       api_keys=args.api_keys,
                       ssl=args.ssl,
                       qos_config_path=args.qos_config_path)

    @staticmethod
    def api_client(args):
        """Interact with restful api server in terminal."""
        from lmdeploy.serve.openai.api_client import main as run_api_client
        kwargs = convert_args(args)
        run_api_client(**kwargs)

    @staticmethod
    def triton_client(args):
        """Interact with Triton Server using gRPC protocol."""
        from lmdeploy.serve.client import main as run_triton_client
        kwargs = convert_args(args)
        run_triton_client(**kwargs)

    @staticmethod
    def add_parsers():
        SubCliServe.add_parser_gradio()
        SubCliServe.add_parser_api_server()
        SubCliServe.add_parser_api_client()
        SubCliServe.add_parser_triton_client()
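
# Usage sketch (illustrative, not from this commit): after starting the server
# with `lmdeploy serve api_server <model_path> --server-port 23333`, the
# OpenAI-compatible endpoint can be queried from Python; the prompt and the
# address below are placeholders.
if __name__ == '__main__':
    from lmdeploy.serve.openai.api_client import APIClient

    api_client = APIClient('http://0.0.0.0:23333')
    print(api_client.available_models)
    for output in api_client.chat('hi', 1):
        print(output)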

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn

from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
                                            quant_weights, smooth_layers)
from lmdeploy.lite.utils import collect_target_modules

from .calibrate import calibrate

# from lmdeploy.lite.utils.export_turbomind import export_turbomind_config

LAYER_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMDecoderLayer',
    'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
    'QWenLMHeadModel': 'QWenBlock',
    'BaiChuanForCausalLM': 'DecoderLayer',  # Baichuan 7B
    'BaichuanForCausalLM': 'DecoderLayer',  # Baichuan2 7B
@@ -21,6 +21,7 @@ LAYER_TYPE_MAP = {
}
NORM_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMRMSNorm',
    'InternLM2ForCausalLM': 'InternLM2RMSNorm',
    'QWenLMHeadModel': 'RMSNorm',
    'BaiChuanForCausalLM': 'RMSNorm',  # Baichuan 7B
    'BaichuanForCausalLM': 'RMSNorm',  # Baichuan2 7B
@@ -29,30 +30,33 @@ NORM_TYPE_MAP = {


def auto_awq(model: str,
             work_dir: str = './work_dir',
             calib_dataset: str = 'ptb',
             calib_samples: int = 128,
             calib_seqlen: int = 2048,
             w_bits: int = 4,
             w_sym: bool = False,
             w_group_size: int = 128,
             device: str = 'cuda'):
    """Perform weight quantization using AWQ algorithm.

    Args:
        model (str): The path of model in hf format.
        work_dir (str): The working directory to save results.
        calib_dataset (str): The calibration dataset name.
        calib_samples (int): The number of samples for calibration.
        calib_seqlen (int): The sequence length for calibration.
        w_bits (int): Bit number for weight quantization.
        w_sym (bool): Whether to do symmetric quantization.
        w_group_size (int): Group size for weight quantization statistics.
        device (str): Device type of running.
    """
    model, tokenizer, work_dir = calibrate(model, calib_dataset, calib_samples,
                                           calib_seqlen, work_dir, device)

    layer_type = LAYER_TYPE_MAP[type(model).__name__]
    fc2fcs = FC_FCS_MAP[layer_type]
    norm2fcs = NORM_FCS_MAP[layer_type]

    act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
    layers = collect_target_modules(model, layer_type)
    fcs = {}
@@ -68,11 +72,6 @@ def auto_awq(model: str,
                          safe_serialization=False)
    tokenizer.save_pretrained(work_dir)


if __name__ == '__main__':
    import fire
...
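
# Programmatic usage sketch for auto_awq above (illustrative; the checkpoint
# path is a placeholder, and an fp16 HF model plus a CUDA device are assumed):
#
#   from lmdeploy.lite.apis.auto_awq import auto_awq
#   auto_awq('./internlm-chat-7b', work_dir='./work_dir',
#            calib_dataset='ptb', calib_samples=128, calib_seqlen=2048,
#            w_bits=4, w_group_size=128, device='cuda')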

@@ -13,19 +13,31 @@ from lmdeploy.lite.utils import (collect_target_modules, get_calib_loaders,

LAYER_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMDecoderLayer',
    'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
    'QWenLMHeadModel': 'QWenBlock',
    'BaiChuanForCausalLM': 'DecoderLayer',  # Baichuan 7B
    'BaichuanForCausalLM': 'DecoderLayer',  # Baichuan2 7B
    'LlamaForCausalLM': 'LlamaDecoderLayer',
}

NORM_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMRMSNorm',
    'InternLM2ForCausalLM': 'InternLM2RMSNorm',
    'QWenLMHeadModel': 'RMSNorm',
    'BaiChuanForCausalLM': 'RMSNorm',  # Baichuan 7B
    'BaichuanForCausalLM': 'RMSNorm',  # Baichuan2 7B
    'LlamaForCausalLM': 'LlamaRMSNorm',
}

HEAD_NAME_MAP = {
    'InternLMForCausalLM': 'lm_head',
    'InternLM2ForCausalLM': 'output',
    'QWenLMHeadModel': 'lm_head',
    'BaiChuanForCausalLM': 'lm_head',  # Baichuan 7B
    'BaichuanForCausalLM': 'lm_head',  # Baichuan2 7B
    'LlamaForCausalLM': 'lm_head',
}


def _prepare_for_calibrate(model: nn.Module,
                           layer_type: Union[str, type],
@@ -99,7 +111,7 @@ def _prepare_for_calibrate(model: nn.Module,

def calibrate(model: str,
              calib_dataset: str = 'ptb',
              calib_samples: int = 128,
              calib_seqlen: int = 2048,
              work_dir: str = './work_dir',
@@ -110,7 +122,7 @@ def calibrate(model: str,
    Args:
        model (str): The name or path of the model to be loaded.
        calib_dataset (str, optional): The calibration dataset name.
            Defaults to 'ptb'.
        calib_samples (int, optional): The number of samples for calibration.
            Defaults to 128.
        calib_seqlen (int, optional): The sequence length for calibration.
@@ -119,6 +131,11 @@ def calibrate(model: str,
            Defaults to './work_dir'.
        device (str, optional): The device to be used for calculation.
            Defaults to 'cuda'.

    Returns:
        model (nn.Module): The loaded huggingface model.
        tokenizer: The loaded huggingface tokenizer.
        work_dir (str): The working directory for outputs.
    """
    assert calib_dataset in ['c4', 'ptb', 'wikitext2', 'pileval'], \
@@ -152,7 +169,8 @@ def calibrate(model: str,
    layer_type = LAYER_TYPE_MAP[type(model).__name__]
    norm_type = NORM_TYPE_MAP[type(model).__name__]

    _prepare_for_calibrate(model, layer_type,
                           HEAD_NAME_MAP[type(model).__name__], device)

    print('Loading calibrate dataset ...')
    calib_loader, _ = get_calib_loaders(calib_dataset,
@@ -179,6 +197,8 @@ def calibrate(model: str,
    work_dir.mkdir(parents=True, exist_ok=True)
    calib_ctx.export(work_dir)

    return model, tokenizer, work_dir


if __name__ == '__main__':
    import fire
...
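
# Usage sketch for calibrate above (illustrative; the model id is a
# placeholder). Note the return value added in this commit, which auto_awq
# now consumes directly:
#
#   model, tokenizer, work_dir = calibrate('internlm/internlm-chat-7b',
#                                          calib_dataset='ptb',
#                                          calib_samples=128,
#                                          calib_seqlen=2048,
#                                          work_dir='./work_dir',
#                                          device='cuda')
#   act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']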

@@ -15,6 +15,10 @@ NORM_FCS_MAP = {
        ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
        'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
    },
    'InternLM2DecoderLayer': {
        'attention_norm': ['attention.wqkv'],
        'ffn_norm': ['feed_forward.w1', 'feed_forward.w3']
    },
    'QWenBlock': {
        'ln_1': ['attn.c_attn'],
        'ln_2': ['mlp.w1', 'mlp.w2']
@@ -34,6 +38,9 @@ FC_FCS_MAP = {
        'self_attn.v_proj': ['self_attn.o_proj'],
        'mlp.up_proj': ['mlp.down_proj']
    },
    'InternLM2DecoderLayer': {
        'feed_forward.w3': ['feed_forward.w2']
    },
    'QWenBlock': {
        'attn.c_attn': ['attn.c_proj'],
        'mlp.w1': ['mlp.c_proj']
@@ -71,6 +78,13 @@ def smooth_ln_fcs(ln: torch.nn.Module,
    :return: Scales
    """
    device, dtype = fcs[0].weight.device, fcs[0].weight.dtype

    # If zeros exist within the weight of the layer norm, it becomes
    # unnecessary to perform smooth quantization at the positions where
    # these zeros occur.
    zero_positions = (ln.weight == 0).nonzero(as_tuple=True)[0]
    nonzero_positions = (ln.weight != 0).nonzero(as_tuple=True)[0]

    act_scales = act_scales.to(device=device, dtype=dtype)
    concat_w = torch.cat([fc.weight for fc in fcs], dim=0)
@@ -78,7 +92,11 @@ def smooth_ln_fcs(ln: torch.nn.Module,
    scales = (act_scales.pow(alpha) /
              w_scales.pow(1 - alpha)).to(device).to(dtype)

    scales = scales / (scales[nonzero_positions].max() *
                       scales[nonzero_positions].min()).sqrt()
    scales[zero_positions] = 1

    ln.weight.div_(scales)
    if hasattr(ln, 'bias'):
@@ -182,8 +200,8 @@ def check_awq_supported(layer_type):

def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
    """Quantize the weights of the target model's linear layers."""
    from lmdeploy.lite.quantization import WeightQuantizer
    from lmdeploy.pytorch.modules import WeightOnlyQLinear
    for name, fc in fcs.items():
        fc.to(device)
        quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
...
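
# A small numeric sketch (illustrative, not from this commit) of the
# zero-aware scale computation introduced in smooth_ln_fcs above: channels
# whose layer-norm weight is exactly zero are excluded from the max/min
# normalization and their scale is pinned to 1, so they stay untouched.
import torch

alpha = 0.5
ln_weight = torch.tensor([1.0, 0.0, 2.0, 0.5])   # toy norm weight with a zero
act_scales = torch.tensor([4.0, 3.0, 2.0, 1.0])  # toy activation absmax stats
w_scales = torch.tensor([0.5, 0.25, 1.0, 2.0])   # toy per-channel weight absmax

zero_positions = (ln_weight == 0).nonzero(as_tuple=True)[0]
nonzero_positions = (ln_weight != 0).nonzero(as_tuple=True)[0]

scales = act_scales.pow(alpha) / w_scales.pow(1 - alpha)
scales = scales / (scales[nonzero_positions].max() *
                   scales[nonzero_positions].min()).sqrt()
scales[zero_positions] = 1
print(scales)  # the second channel keeps a scale of exactly 1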
...@@ -3,6 +3,8 @@ from functools import partial ...@@ -3,6 +3,8 @@ from functools import partial
from typing import Union from typing import Union
import torch import torch
import transformers
from mmengine import digit_version
from torch import nn from torch import nn
from transformers import PreTrainedTokenizer from transformers import PreTrainedTokenizer
...@@ -53,7 +55,6 @@ class CalibrationContext(): ...@@ -53,7 +55,6 @@ class CalibrationContext():
self.num_kv_heads = num_kv_heads self.num_kv_heads = num_kv_heads
self.head_dim = model.config.hidden_size // num_attn_heads self.head_dim = model.config.hidden_size // num_attn_heads
self.model = model self.model = model
del self.model.lm_head
self.tokenizer = tokenizer self.tokenizer = tokenizer
...@@ -163,12 +164,36 @@ class CalibrationContext(): ...@@ -163,12 +164,36 @@ class CalibrationContext():
if k_obs and v_obs: if k_obs and v_obs:
batch_kwargs[i]['use_cache'] = True batch_kwargs[i]['use_cache'] = True
out = self._ori_forwards[mod](*batch_args[i], version = digit_version(transformers.__version__)
**batch_kwargs[i]) use_new_cache = type(mod).__name__ == 'LlamaDecoderLayer'
out = list(out) if version > digit_version('4.36.0') and use_new_cache:
key, value = out.pop(-1) from transformers.cache_utils import DynamicCache
k_obs.observe(key) batch_kwargs[i]['past_key_value'] = DynamicCache()
v_obs.observe(value)
ori_idx = mod.self_attn.layer_idx
mod.self_attn.layer_idx = 0
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
mod.self_attn.layer_idx = ori_idx
out = list(out)
cache = out.pop(-1)
key = cache.key_cache.pop(-1)
value = cache.value_cache.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
else:
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
out = list(out)
key, value = out.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
del key, value del key, value
torch.cuda.empty_cache() torch.cuda.empty_cache()
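As a side note on the hunk above, the branch is selected with mmengine's digit_version, which turns a version string into a comparable tuple; a minimal sketch (the version numbers are illustrative):

```python
# Sketch of the version gate used above: only transformers releases newer than
# 4.36.0 take the DynamicCache path for LlamaDecoderLayer.
from mmengine import digit_version

print(digit_version('4.37.2') > digit_version('4.36.0'))  # True  -> DynamicCache branch
print(digit_version('4.33.0') > digit_version('4.36.0'))  # False -> legacy (key, value) tuple branch
```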
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import torch import torch
from transformers import AutoConfig, AutoModelForCausalLM from transformers import AutoConfig, AutoModelForCausalLM
from lmdeploy.pytorch.model import LoadWoInit from lmdeploy.pytorch.accel import LoadNoInit
def load_hf_from_pretrained(pretrained_model_name_or_path, def load_hf_from_pretrained(pretrained_model_name_or_path,
...@@ -26,7 +26,7 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, ...@@ -26,7 +26,7 @@ def load_hf_from_pretrained(pretrained_model_name_or_path,
elif dtype == torch.bfloat16: elif dtype == torch.bfloat16:
hf_config.bf16 = True hf_config.bf16 = True
with LoadWoInit(): with LoadNoInit():
# Load model # Load model
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, config=hf_config, **kwargs) pretrained_model_name_or_path, config=hf_config, **kwargs)
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import dataclasses import dataclasses
import json
from abc import abstractmethod from abc import abstractmethod
from typing import List from typing import List, Literal, Optional
from mmengine import Registry from mmengine import Registry
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
MODELS = Registry('model', locations=['lmdeploy.model']) MODELS = Registry('model', locations=['lmdeploy.model'])
@dataclasses.dataclass @dataclasses.dataclass
class SamplingParam: class ChatTemplateConfig:
top_p: float = 0.8 """Parameters for chat template.
top_k: float = None
temperature: float = 0.8 Args:
    repetition_penalty: float = 1.0        model_name (str): the name of the deployed model. Determines which chat template will be applied.
All the chat template names: `lmdeploy list`
system (str | None): begin of the system prompt
meta_instruction (str | None): system prompt
eosys (str | None): end of the system prompt
user (str | None): begin of the user prompt
eoh (str | None): end of the user prompt
assistant (str | None): begin of the assistant prompt
eoa (str | None): end of the assistant prompt
capability: ('completion' | 'infilling' | 'chat' | 'python') = None
""" # noqa: E501
model_name: str
system: Optional[str] = None
meta_instruction: Optional[str] = None
eosys: Optional[str] = None
user: Optional[str] = None
eoh: Optional[str] = None
assistant: Optional[str] = None
eoa: Optional[str] = None
separator: Optional[str] = None
capability: Optional[Literal['completion', 'infilling', 'chat',
'python']] = None
stop_words: Optional[List[str]] = None
@property
def chat_template(self):
attrs = {
key: value
for key, value in dataclasses.asdict(self).items()
if value is not None
}
attrs.pop('model_name', None)
if self.model_name in MODELS.module_dict.keys():
model: BaseModel = MODELS.get(self.model_name)(**attrs)
else:
logger.warning(
f'Could not find {self.model_name} in registered models. '
                f'Falling back to the BaseChatTemplate for {self.model_name}.')
model = BaseChatTemplate(**attrs)
return model
def to_json(self, file_path=None):
"""Convert the dataclass instance to a JSON formatted string and
optionally save to a file."""
json_str = json.dumps(dataclasses.asdict(self),
ensure_ascii=False,
indent=4)
if file_path:
with open(file_path, 'w', encoding='utf-8') as file:
file.write(json_str)
return json_str
@classmethod
def from_json(cls, file_or_string):
"""Construct a dataclass instance from a JSON file or JSON string."""
try:
# Try to open the input_data as a file path
with open(file_or_string, 'r', encoding='utf-8') as file:
json_data = file.read()
except FileNotFoundError:
# If it's not a file path, assume it's a JSON string
json_data = file_or_string
except IOError:
# If it's not a file path and not a valid JSON string, raise error
raise ValueError(
'Invalid input. Must be a file path or a valid JSON string.')
json_data = json.loads(json_data)
assert json_data.get('model_name', None) is not None, \
'model_name is a must for json chat template.'
if json_data['model_name'] not in MODELS.module_dict.keys():
MODELS.register_module(json_data['model_name'],
module=BaseChatTemplate)
return cls(**json_data)
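A minimal sketch of how the config above can be round-tripped; the model name and delimiter strings are made-up placeholders, not a real chat format:

```python
# Illustrative only: serialize a custom template, reload it, and materialize it.
cfg = ChatTemplateConfig(model_name='my-chat-model',
                         meta_instruction='Be concise.',
                         eosys='\n', user='<|user|>', eoh='\n',
                         assistant='<|bot|>', eoa='\n')
cfg.to_json('my_chat_template.json')                  # dumps and writes the file
restored = ChatTemplateConfig.from_json('my_chat_template.json')
template = restored.chat_template                     # a BaseChatTemplate built from the stored fields
```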
@MODELS.register_module(name='internlm')
@MODELS.register_module(name='llama') @MODELS.register_module(name='llama')
@MODELS.register_module(name='base') @MODELS.register_module(name='base')
class BaseModel: class BaseModel:
...@@ -24,18 +100,10 @@ class BaseModel: ...@@ -24,18 +100,10 @@ class BaseModel:
def __init__(self, def __init__(self,
session_len=2048, session_len=2048,
top_p=0.8,
top_k=None,
temperature=0.8,
repetition_penalty=1.0,
capability='chat', capability='chat',
stop_words=None, stop_words=None,
**kwargs): **kwargs):
self.session_len = session_len self.session_len = session_len
self.top_p = top_p
self.top_k = top_k
self.temperature = temperature
self.repetition_penalty = repetition_penalty
self.stop_words = stop_words self.stop_words = stop_words
self.capability = capability self.capability = capability
...@@ -50,43 +118,8 @@ class BaseModel: ...@@ -50,43 +118,8 @@ class BaseModel:
Returns: Returns:
str: the concatenated prompt str: the concatenated prompt
""" """
if self.capability == 'completion':
return prompt
else:
return self.decorate_prompt(prompt, sequence_start)
@abstractmethod
def decorate_prompt(self, prompt, sequence_start):
return prompt return prompt
@staticmethod
def _translate_messages(messages: List):
"""Translate messages into system, user speaking list, assistant
speaking list.
Args:
messages (List): chat history
Returns:
            Tuple: consists of system (str), users (List[str]),
assistants (List[str])
"""
system = None
users = []
assistants = []
assert isinstance(messages, List)
for message in messages:
msg_role = message['role']
if msg_role == 'system':
system = message['content']
elif msg_role == 'user':
users.append(message['content'])
elif msg_role == 'assistant':
assistants.append(message['content'])
else:
raise ValueError(f'Unknown role: {msg_role}')
assistants.append(None)
return system, users, assistants
@abstractmethod @abstractmethod
def messages2prompt(self, messages, sequence_start=True): def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the """Return the prompt that is concatenated with other elements in the
...@@ -103,31 +136,40 @@ class BaseModel: ...@@ -103,31 +136,40 @@ class BaseModel:
return self.get_prompt(messages) return self.get_prompt(messages)
# chat history processing in derived classes # chat history processing in derived classes
@property @classmethod
def sampling_param(self): def match(cls, model_path: str) -> Optional[str]:
return SamplingParam(top_p=self.top_p, """Return the model_name that was registered to MODELS.
top_k=self.top_k,
temperature=self.temperature,
repetition_penalty=self.repetition_penalty)
Args:
model_path (str): the model path used for matching.
"""
return None
@MODELS.register_module(name='wizardlM')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseModel):
"""Chat template of vicuna model."""
def __init__( class BaseChatTemplate(BaseModel):
self, """Base Chat template."""
system="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. """, # noqa: E501
user='USER', def __init__(self,
assistant='ASSISTANT', system='',
**kwargs): meta_instruction='',
eosys='',
user='',
eoh='',
assistant='',
eoa='',
separator='',
**kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.system = system self.system = system
self.meta_instruction = meta_instruction
self.user = user self.user = user
self.eoh = eoh
self.eoa = eoa
self.separator = separator
self.eosys = eosys
self.assistant = assistant self.assistant = assistant
def decorate_prompt(self, prompt, sequence_start=True): def get_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the """Return the prompt that is concatenated with other elements in the
chat template. chat template.
...@@ -138,12 +180,20 @@ class Vicuna(BaseModel): ...@@ -138,12 +180,20 @@ class Vicuna(BaseModel):
Returns: Returns:
str: the concatenated prompt str: the concatenated prompt
""" """
assert self.capability == 'chat', \ if self.capability == 'completion':
f'{type(self).__name__} has no capability of {self.capability}' return prompt
if sequence_start: if sequence_start:
return f'{self.system} {self.user}: {prompt} {self.assistant}: ' # None is different from ''
if self.meta_instruction is not None:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else: else:
return f'</s>{self.user}: {prompt} {self.assistant}: ' return f'{self.separator}{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the """Return the prompt that is concatenated with other elements in the
...@@ -156,20 +206,65 @@ class Vicuna(BaseModel): ...@@ -156,20 +206,65 @@ class Vicuna(BaseModel):
""" """
if isinstance(messages, str): if isinstance(messages, str):
return self.get_prompt(messages, sequence_start) return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages) box_map = dict(user=self.user,
system = self.system if not system else system assistant=self.assistant,
ret = system + ' ' system=self.system)
for user, assistant in zip(users, assistants): eox_map = dict(user=self.eoh,
if assistant: assistant=self.eoa + self.separator,
ret += f'{self.user}: {user} {self.assistant}: {assistant}</s>' system=self.eosys)
else: ret = ''
ret += f'{self.user}: {user} {self.assistant}: ' if self.meta_instruction is not None:
if len(messages) and messages[0]['role'] != 'system':
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{box_map[role]}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret return ret
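To make the concatenation rules above concrete, a hedged example with placeholder delimiters (not any real model's format):

```python
# The expected string below was worked out by hand from messages2prompt above.
tmpl = BaseChatTemplate(system='<SYS>', meta_instruction='Be helpful.', eosys='\n',
                        user='<USER>', eoh='\n',
                        assistant='<BOT>', eoa='</s>', separator='\n')
messages = [{'role': 'user', 'content': 'hi'},
            {'role': 'assistant', 'content': 'hello'},
            {'role': 'user', 'content': 'who are you?'}]
tmpl.messages2prompt(messages)
# -> '<SYS>Be helpful.\n<USER>hi\n<BOT>hello</s>\n<USER>who are you?\n<BOT>'
```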
@MODELS.register_module(name='wizardlm')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseChatTemplate):
"""Chat template of vicuna model."""
def __init__(
self,
meta_instruction="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""", # noqa: E501
eosys=' ',
user='USER: ',
eoh=' ',
assistant='ASSISTANT: ',
eoa='</s>',
stop_words=['</s>'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'vicuna' in model_path.lower():
return 'vicuna'
if 'wizardlm' in model_path.lower():
return 'wizardlm'
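For reference, the single-turn prompt this template renders (the output is reproduced by hand, so treat it as illustrative):

```python
vicuna = Vicuna()
vicuna.get_prompt('Hello!')
# -> "A chat between a curious user and an artificial intelligence assistant. "
#    "... to the user's questions. USER: Hello! ASSISTANT: "
```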
@MODELS.register_module(name='internlm-chat') @MODELS.register_module(name='internlm-chat')
@MODELS.register_module(name='internlm-chat-7b') @MODELS.register_module(name='internlm-chat-7b')
class InternLMChat7B(BaseModel): @MODELS.register_module(name='internlm')
class InternLMChat7B(BaseChatTemplate):
"""Chat template of InternLM model.""" """Chat template of InternLM model."""
def __init__( def __init__(
...@@ -179,67 +274,36 @@ class InternLMChat7B(BaseModel): ...@@ -179,67 +274,36 @@ class InternLMChat7B(BaseModel):
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
""", # noqa: E501 """, # noqa: E501
eosys='\n',
user='<|User|>:', user='<|User|>:',
eoh='\n', eoh='\n',
eoa='<eoa>\n',
eosys='\n',
assistant='<|Bot|>:', assistant='<|Bot|>:',
eoa='<eoa>',
separator='\n',
stop_words=['<eoa>'], stop_words=['<eoa>'],
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.system = system meta_instruction=meta_instruction,
self.meta_instruction = meta_instruction eosys=eosys,
self.user = user user=user,
self.eoh = eoh eoh=eoh,
self.eoa = eoa assistant=assistant,
self.eosys = eosys eoa=eoa,
self.assistant = assistant separator=separator,
self.stop_words = stop_words stop_words=stop_words,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'\n{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
path = model_path.lower()
if isinstance(messages, str): if all([c not in path for c in ['internlm2', '8k']]) and \
return self.get_prompt(messages, sequence_start) all([c in path for c in ['internlm', 'chat']]):
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys) return 'internlm'
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
@MODELS.register_module(name='internlm-chat-20b') @MODELS.register_module(name='internlm-chat-20b')
...@@ -254,7 +318,7 @@ class InternLMChat7B8K(InternLMChat7B): ...@@ -254,7 +318,7 @@ class InternLMChat7B8K(InternLMChat7B):
@MODELS.register_module(name='internlm-20b') @MODELS.register_module(name='internlm-20b')
class InternLMBaseModel20B(BaseModel): class InternLMBaseModel20B(BaseChatTemplate):
"""Generation parameters of InternLM-20B-Base model.""" """Generation parameters of InternLM-20B-Base model."""
def __init__(self, session_len=4096, capability='completion', **kwargs): def __init__(self, session_len=4096, capability='completion', **kwargs):
...@@ -263,71 +327,94 @@ class InternLMBaseModel20B(BaseModel): ...@@ -263,71 +327,94 @@ class InternLMBaseModel20B(BaseModel):
**kwargs) **kwargs)
@MODELS.register_module(
name=['internlm2-1_8b', 'internlm2-7b', 'internlm2-20b'])
class InternLM2BaseModel7B(BaseChatTemplate):
"""Generation parameters of InternLM2-7B-Base model."""
def __init__(self, session_len=32768, capability='completion', **kwargs):
super().__init__(session_len=session_len,
capability=capability,
**kwargs)
@MODELS.register_module(name=[
'internlm2-chat', 'internlm2-chat-1_8b', 'internlm2-chat-7b',
'internlm2-chat-20b'
])
@MODELS.register_module(name='internlm2')
class InternLM2Chat7B(InternLMChat7B):
"""Chat template and generation parameters of InternLM2-Chat-7B."""
def __init__(self,
session_len=32768,
system='<|im_start|>system\n',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|action_end|>'],
**kwargs):
super(InternLM2Chat7B, self).__init__(session_len=session_len,
system=system,
user=user,
assistant=assistant,
eosys=eosys,
eoh=eoh,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'internlm2' in path and ('chat' in path or 'math' in path):
return 'internlm2'
@MODELS.register_module(name='baichuan-7b') @MODELS.register_module(name='baichuan-7b')
class Baichuan7B(BaseModel): @MODELS.register_module(name='baichuan-base')
class Baichuan7B(BaseChatTemplate):
"""Generation parameters of Baichuan-7B base model.""" """Generation parameters of Baichuan-7B base model."""
def __init__(self, repetition_penalty=1.1, **kwargs): def __init__(self, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.repetition_penalty = repetition_penalty
@MODELS.register_module(name='baichuan2-7b') @MODELS.register_module(name='baichuan2-7b')
class Baichuan2_7B(BaseModel): @MODELS.register_module(name='baichuan2')
class Baichuan2_7B(BaseChatTemplate):
"""Chat template and generation parameters of Baichuan2-7B-Base and """Chat template and generation parameters of Baichuan2-7B-Base and
Baichuan2-7B-Chat models.""" Baichuan2-7B-Chat models."""
def __init__(self, def __init__(self,
temperature=0.3, user='<reserved_106>',
top_k=5, assistant='<reserved_107>',
top_p=0.85,
repetition_penalty=1.05,
**kwargs): **kwargs):
super().__init__(temperature=temperature, super().__init__(user=user, assistant=assistant, **kwargs)
top_k=top_k,
top_p=top_p,
repetition_penalty=repetition_penalty,
**kwargs)
self.user_token = '<reserved_106>' # id = 195
self.assistant_token = '<reserved_107>' # id = 196
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ path = model_path.lower()
f'{type(self).__name__} has no capability of {self.capability}' if 'baichuan2' in path and 'chat' in path:
return f'{self.user_token}{prompt}{self.assistant_token}' return 'baichuan2'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
ret = ''
for user, assistant in zip(users, assistants):
ret += f'{self.user_token}{user}{self.assistant_token}'
if assistant:
ret += f'{assistant}'
return ret
@MODELS.register_module(name='puyu') @MODELS.register_module(name='puyu')
class Puyu(BaseModel): class Puyu(BaseChatTemplate):
"""Chat template of puyu model.This is only for internal usage in Shanghai """Chat template of puyu model.This is only for internal usage in Shanghai
AI Laboratory.""" AI Laboratory."""
...@@ -341,217 +428,136 @@ class Puyu(BaseModel): ...@@ -341,217 +428,136 @@ class Puyu(BaseModel):
eoa='', eoa='',
stop_words=None, stop_words=None,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(meta_instruction=meta_instruction,
self.meta_instruction = meta_instruction system=system,
self.system = system eosys=eosys,
self.user = user user=user,
self.assistant = assistant eoh=eoh,
self.stop_words = stop_words assistant=assistant,
self.eosys = eosys eoa=eoa,
self.eoh = eoh stop_words=stop_words,
self.eoa = eoa **kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.eoa}{self.user}{prompt}{self.eoh}{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): flag to start the sequence
Returns:
str: the concatenated prompt
""" """
if isinstance(messages, str): if 'puyu' in model_path.lower():
return self.get_prompt(messages, sequence_start) return 'puyu'
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
@MODELS.register_module(name='llama2') @MODELS.register_module(name=['llama2', 'llama-2', 'llama-2-chat'])
class Llama2(BaseModel): class Llama2(BaseChatTemplate):
"""Chat template of LLaMA2 model.""" """Chat template of LLaMA2 model."""
def __init__( def __init__(
self, self,
b_inst='[INST]', system='[INST] <<SYS>>\n',
e_inst='[/INST]', meta_instruction="""\
b_sys='<<SYS>>\n',
e_sys='\n<</SYS>>\n\n',
system="""\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", # noqa: E501 If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", # noqa: E501
eosys='\n<</SYS>>\n\n',
assistant=' [/INST] ',
eoa='</s>',
separator='<s>[INST] ',
session_len=4096, session_len=4096,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.b_inst = b_inst meta_instruction=meta_instruction,
self.e_inst = e_inst eosys=eosys,
self.b_sys = b_sys assistant=assistant,
self.e_sys = e_sys eoa=eoa,
self.default_sys_prompt = system separator=separator,
self.session_len = session_len session_len=session_len,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ if 'llama-2' in model_path.lower() or 'llama2' in model_path.lower():
f'{type(self).__name__} has no capability of {self.capability}' return 'llama2'
if sequence_start:
return f'{self.b_inst} ' \
f'{self.b_sys} {self.default_sys_prompt} {self.e_sys}' \
f'{prompt} {self.e_inst} '
return f'{self.b_inst} {prompt} {self.e_inst} '
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.default_sys_prompt if not system else system
ret = f'{self.b_inst} {self.b_sys} {system} {self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
if i != 0:
ret += f'{self.b_inst} '
if assistant:
ret += f'{user} {self.e_inst} {assistant}'
else:
ret += f'{user} {self.e_inst} '
return ret
@MODELS.register_module(name='qwen-72b') @MODELS.register_module(name='qwen-72b')
@MODELS.register_module(name='qwen-14b') @MODELS.register_module(name='qwen-14b')
@MODELS.register_module(name='qwen-7b') @MODELS.register_module(name='qwen-7b')
class Qwen7BChat(BaseModel): @MODELS.register_module(name='qwen')
class Qwen7BChat(BaseChatTemplate):
"""Chat template for Qwen-7B-Chat.""" """Chat template for Qwen-7B-Chat."""
def __init__(self, def __init__(self,
session_len=8192, session_len=8192,
top_p=0.5, system='<|im_start|>system\n',
top_k=40, meta_instruction='You are a helpful assistant.',
temperature=1.0, eosys='<|im_end|>\n',
im_start='<|im_start|>', user='<|im_start|>user\n',
im_end='<|im_end|>', eoh='<|im_end|>\n',
system='You are a helpful assistant.', assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>'], stop_words=['<|im_end|>'],
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.session_len = session_len meta_instruction=meta_instruction,
self.top_p = top_p eosys=eosys,
self.top_k = top_k user=user,
self.temperature = temperature eoh=eoh,
assistant=assistant,
self.im_start = im_start eoa=eoa,
self.im_end = im_end separator=separator,
self.system = system stop_words=stop_words,
self.stop_words = stop_words session_len=session_len,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.im_start}system\n{self.system}{self.im_end}' \
f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
if isinstance(messages, str): if 'qwen' in model_path.lower():
return self.get_prompt(messages, sequence_start) return 'qwen'
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.im_start}system\n{system}{self.im_end}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n{assistant}'
else:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return ret
@MODELS.register_module(name='codellama') @MODELS.register_module(name='codellama')
class CodeLlama(Llama2): class CodeLlama(Llama2):
def __init__(self, def __init__(self,
system='', meta_instruction='',
session_len=4096, session_len=4096,
suffix_first=False, suffix_first=False,
stop_words=None, stop_words=None,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(meta_instruction=meta_instruction,
session_len=session_len,
stop_words=stop_words,
**kwargs)
caps = ['completion', 'infilling', 'chat', 'python'] caps = ['completion', 'infilling', 'chat', 'python']
assert self.capability in caps, \ assert self.capability in caps, \
f'{self.capability} is not supported. ' \ f'{self.capability} is not supported. ' \
f'The supported capabilities are: {caps}' f'The supported capabilities are: {caps}'
self.default_sys_prompt = system self.meta_instruction = meta_instruction
self.session_len = session_len self.session_len = session_len
self.suffix_first = suffix_first self.suffix_first = suffix_first
self.stop_words = stop_words self.stop_words = stop_words
if self.capability == 'infilling':
# The following sampling parameters refers to https://github.com/facebookresearch/codellama # noqa: E501
if self.capability == 'completion' or self.capability == 'python':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.2)
if self.capability == 'chat':
self.top_p = kwargs.get('top_p', 0.95)
self.temperature = kwargs.get('temperature', 0.2)
elif self.capability == 'infilling':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.0)
if self.stop_words is None: if self.stop_words is None:
self.stop_words = ['<EOT>'] self.stop_words = ['<EOT>']
def decorate_prompt(self, prompt, sequence_start=True): def get_prompt(self, prompt, sequence_start=True):
if self.capability == 'infilling': if self.capability == 'infilling':
return self._infill_prompt(prompt) return self._infill_prompt(prompt)
elif self.capability == 'chat': elif self.capability == 'chat':
return self._get_prompt(prompt, sequence_start) return super().get_prompt(prompt, sequence_start)
        else: # python specialist        else: # python specialist
return prompt return prompt
...@@ -565,92 +571,130 @@ class CodeLlama(Llama2): ...@@ -565,92 +571,130 @@ class CodeLlama(Llama2):
prompt = f'<PRE> {prefix} <SUF>{suffix} <MID>' prompt = f'<PRE> {prefix} <SUF>{suffix} <MID>'
return prompt return prompt
def _get_prompt(self, prompt, sequence_start): @classmethod
prompt = prompt.strip() def match(cls, model_path: str) -> Optional[str]:
if sequence_start: """Return the model_name that was registered to MODELS.
return f'{self.b_inst} ' \
f'{self.b_sys}{self.default_sys_prompt}{self.e_sys}' \ Args:
f'{prompt} {self.e_inst}' model_path (str): the model path used for matching.
"""
if 'codellama' in model_path.lower():
return 'codellama'
@MODELS.register_module(name='falcon')
class Falcon(BaseModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'falcon' in model_path.lower():
return 'falcon'
@MODELS.register_module(name='chatglm2-6b')
@MODELS.register_module(name='chatglm')
class ChatGLM2(BaseModel):
def __init__(self,
user='问:',
eoh='\n\n',
assistant='答:',
eoa='\n\n',
**kwargs):
super().__init__(**kwargs)
self._user = user
self._assistant = assistant
self._eoh = eoh
self._eoa = eoa
self.count = 0
return f'{self.b_inst} {prompt} {self.e_inst}' def get_prompt(self, prompt, sequence_start=True):
"""get prompt."""
        # needs more checking
# https://github.com/THUDM/ChatGLM2-6B/issues/48
# [64790, 64792] to be prepended
self.count += 1
ret = f'[Round {self.count}]\n\n'
ret += f'{self._user}{prompt}{self._eoh}'
ret += f'{self._assistant}'
return ret
def messages2prompt(self, messages, sequence_start=True): def messages2prompt(self, messages, sequence_start=True):
assert self.capability == 'chat', \ """message to prompt."""
f'codellama message2prompt only supports chat mode ' \ if isinstance(messages, str):
f'but got {self.cap} mode' return self.get_prompt(messages, sequence_start)
return super().messages2prompt(messages, sequence_start) ret = ''
count = 0
for message in messages:
role = message['role']
content = message['content']
if role == 'user':
count += 1
ret += f'[Round {count}]\n\n'
ret += f'{self._user}{content}{self._eoh}'
ret += f'{self._assistant}'
if role == 'assistant':
ret += f'{content}'
return ret
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'chatglm' in model_path.lower():
return 'chatglm'
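Unlike the role-tag templates above, ChatGLM2 counts dialogue rounds; a hedged rendering example, with the expected string worked out by hand from the code above:

```python
glm = ChatGLM2()
history = [{'role': 'user', 'content': '你好'},
           {'role': 'assistant', 'content': '你好,请问有什么可以帮助你?'},
           {'role': 'user', 'content': '介绍一下你自己'}]
glm.messages2prompt(history)
# -> '[Round 1]\n\n问:你好\n\n答:你好,请问有什么可以帮助你?'
#    '[Round 2]\n\n问:介绍一下你自己\n\n答:'
```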
@MODELS.register_module(name='solar') @MODELS.register_module(name=['solar', 'solar-70b'])
class SOLAR(BaseModel): class SOLAR(BaseChatTemplate):
"""Chat template of SOLAR model. """Chat template of SOLAR model.
`https://huggingface.co/upstage/SOLAR-0-70b-16bit` `https://huggingface.co/upstage/SOLAR-0-70b-16bit`
""" """
def __init__(self, def __init__(self,
b_sys='### System:\n', system='### System:\n',
e_sys='\n\n', eosys='\n\n',
user='### User:\n', user='### User:\n',
eoh='\n\n', eoh='\n\n',
assistant='### Assistant:\n', assistant='### Assistant:\n',
eoa='\n\n', meta_instruction='',
system='',
session_len=2048, session_len=2048,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.b_sys = b_sys self.system = system
self.e_sys = e_sys self.eosys = eosys
self.user = user self.user = user
self.eoh = eoh self.eoh = eoh
self.assistant = assistant self.assistant = assistant
self.eoa = eoa self.meta_instruction = meta_instruction
self.system = system
self.session_len = session_len self.session_len = session_len
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ if 'solar' in model_path.lower():
f'{type(self).__name__} has no capability of {self.capability}' return 'solar'
if sequence_start:
return f'{self.b_sys}{self.system}{self.e_sys}' \
f'{self.user}{prompt}{self.eoh}{self.assistant}'
return f'{self.user}{prompt}{self.eoh}{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.b_sys}{system}{self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
ret += f'{self.user}{user}{self.eoh}{self.assistant}'
if assistant:
ret += f'{assistant}{self.eoa}'
return ret
@MODELS.register_module(name='ultracm') @MODELS.register_module(name='ultracm')
@MODELS.register_module(name='ultralm') @MODELS.register_module(name='ultralm')
class UltraChat(BaseModel): class UltraChat(BaseChatTemplate):
"""Template of UltraCM and UltraLM models. """Template of UltraCM and UltraLM models.
`https://huggingface.co/openbmb/UltraCM-13b` `https://huggingface.co/openbmb/UltraCM-13b`
...@@ -659,147 +703,222 @@ class UltraChat(BaseModel): ...@@ -659,147 +703,222 @@ class UltraChat(BaseModel):
def __init__( def __init__(
self, self,
system="""User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.</s>""", # noqa: E501 system='User: ',
eos='</s>', meta_instruction="""A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.""", # noqa: E501
eosys='</s>\n',
user='User: ', user='User: ',
eoh='</s>\n',
assistant='Assistant: ', assistant='Assistant: ',
eoa='</s>',
separator='\n',
stop_words=['</s>'],
session_len=2048, session_len=2048,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.system = system meta_instruction=meta_instruction,
self.eos = eos eosys=eosys,
self.session_len = session_len user=user,
self.user = user eoh=eoh,
self.assistant = assistant assistant=assistant,
eoa=eoa,
def decorate_prompt(self, prompt, sequence_start=True): separator=separator,
"""Return the prompt that is concatenated with other elements in the stop_words=stop_words,
chat template. session_len=session_len,
**kwargs)
Args:
prompt (str): the input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
return f'\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. Only evaluate the last instruction completion pair. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
if isinstance(messages, str): if 'ultracm' in model_path.lower():
return self.get_prompt(messages, sequence_start) return 'ultracm'
system, users, assistants = self._translate_messages(messages) if 'ultralm' in model_path.lower():
system = self.system if not system else system return 'ultralm'
ret = f'{system}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}{assistant}{self.eos}'
else:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}'
return ret
@MODELS.register_module(name='yi') @MODELS.register_module(name=['yi', 'yi-chat', 'yi-200k', 'yi-34b'])
class Yi(BaseModel): class Yi(BaseChatTemplate):
"""Chat template of Yi model.""" """Chat template of Yi model."""
def __init__(self, def __init__(self,
system='<|im_start|>system\n', system='<|im_start|>system\n',
meta_instruction=None, meta_instruction=None,
eosys='<|im_end|>\n',
user='<|im_start|>user\n', user='<|im_start|>user\n',
eoh='<|im_end|>\n', eoh='<|im_end|>\n',
eoa='<|im_end|>\n',
eosys='<|im_end|>\n',
assistant='<|im_start|>assistant\n', assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|endoftext|>'], stop_words=['<|im_end|>', '<|endoftext|>'],
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.system = system meta_instruction=meta_instruction,
self.meta_instruction = meta_instruction eosys=eosys,
self.user = user user=user,
self.eoh = eoh eoh=eoh,
self.eoa = eoa assistant=assistant,
self.eosys = eosys eoa=eoa,
self.assistant = assistant separator=separator,
self.stop_words = stop_words stop_words=stop_words,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ path = model_path.lower()
f'{type(self).__name__} has no capability of {self.capability}' if 'yi' in path and 'vl' not in path:
if sequence_start: return 'yi'
if self.meta_instruction is None:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the @MODELS.register_module(name=['mistral', 'mixtral'])
chat template. @MODELS.register_module(name=['Mistral-7B-Instruct', 'Mixtral-8x7B-Instruct'])
class MistralChat(BaseChatTemplate):
"""Template of Mistral and Mixtral Instruct models.
`https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1`
`https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1`
"""
def __init__(self,
user='[INST] ',
eoh=' [/INST]',
eoa='</s>',
session_len=2048,
**kwargs):
super().__init__(user=user,
eoh=eoh,
eoa=eoa,
session_len=session_len,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
if 'instruct' in model_path.lower():
if 'mistral' in model_path.lower():
return 'mistral'
if 'mixtral' in model_path.lower():
return 'mixtral'
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages: @MODELS.register_module(name=['gemma'])
role = message['role'] class Gemma(BaseChatTemplate):
content = message['content'] """Template of Gemma models.
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}' `https://huggingface.co/google/gemma-7b-it`
return ret """
def __init__(self,
user='<start_of_turn>user\n',
eoh='<end_of_turn>\n',
assistant='<start_of_turn>model\n',
eoa='<end_of_turn>\n',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'gemma' in model_path.lower():
return 'gemma'
def main(model_name: str = 'test'): @MODELS.register_module(name=['deepseek-chat'])
assert model_name in MODELS.module_dict.keys(), \ @MODELS.register_module(name=['deepseek'])
f"'{model_name}' is not supported. " \ class Deepseek(BaseChatTemplate):
f'The supported models are: {MODELS.module_dict.keys()}'
model = MODELS.get(model_name)()
prompt = model.get_prompt(prompt='hi')
print(prompt)
print(f'session_len: {model.session_len}')
def __init__(self,
user='User: ',
eoh='\n\n',
assistant='Assistant: ',
eoa='<|end▁of▁sentence|>',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
if __name__ == '__main__': @classmethod
import fire def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
fire.Fire(main) Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'deepseek' in path and 'chat' in path:
return 'deepseek'
@MODELS.register_module(name=['yi-vl'])
class YiVL(BaseChatTemplate):
def __init__(
self,
meta_instruction="""This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers. 这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n""", # noqa: E501
user='### Human: ',
eoh='\n',
assistant='### Assistant:',
eoa='\n',
stop_words=['###'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'yi-vl' in path:
return 'yi-vl'
def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.
Args:
query (str): the input query. Could be a model path.
Return:
str | None: the possible model name or none.
"""
for name, model in MODELS.module_dict.items():
if model.match(query):
return model.match(query)
try:
from transformers import AutoTokenizer
tokenizer_config = AutoTokenizer.from_pretrained(
query, trust_remote_code=True)
if tokenizer_config.chat_template is None:
return 'base'
except Exception as e:
assert type(e) == OSError
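A few hedged examples of how the matcher above resolves paths; the repo ids are illustrative:

```python
best_match_model('internlm/internlm2-chat-7b')           # -> 'internlm2'
best_match_model('lmdeploy/llama2-chat-70b-4bit')         # -> 'llama2'
best_match_model('mistralai/Mistral-7B-Instruct-v0.1')    # -> 'mistral'
# Paths with no registered match fall back to the tokenizer:
# 'base' is returned when the tokenizer ships no chat_template.
```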
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
"""Chat with torch models."""
...@@ -13,6 +13,7 @@ class LoadNoInit: ...@@ -13,6 +13,7 @@ class LoadNoInit:
self.normal_ = torch.nn.init.normal_ self.normal_ = torch.nn.init.normal_
self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_ self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
self.kaiming_normal_ = torch.nn.init.kaiming_normal_ self.kaiming_normal_ = torch.nn.init.kaiming_normal_
self.tensor_normal_ = torch.Tensor.normal_
def __enter__(self, *args, **kwargs): def __enter__(self, *args, **kwargs):
"""Replace initializers with no-op.""" """Replace initializers with no-op."""
...@@ -24,6 +25,7 @@ class LoadNoInit: ...@@ -24,6 +25,7 @@ class LoadNoInit:
torch.nn.init.normal_ = lambda *args, **kwargs: None torch.nn.init.normal_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None
torch.Tensor.normal_ = lambda *args, **kwargs: None
def __exit__(self, *args, **kwargs): def __exit__(self, *args, **kwargs):
"""Recover.""" """Recover."""
...@@ -35,3 +37,4 @@ class LoadNoInit: ...@@ -35,3 +37,4 @@ class LoadNoInit:
torch.nn.init.normal_ = self.normal_ torch.nn.init.normal_ = self.normal_
torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_ torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
torch.nn.init.kaiming_normal_ = self.kaiming_normal_ torch.nn.init.kaiming_normal_ = self.kaiming_normal_
torch.Tensor.normal_ = self.tensor_normal_
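A sketch of the intended usage, mirroring load_hf_from_pretrained earlier in this diff; the checkpoint path is illustrative:

```python
import torch
from transformers import AutoModelForCausalLM

# Inside the context, weight initializers are no-ops, so from_pretrained does not
# waste time randomly initializing tensors that the checkpoint overwrites anyway.
with LoadNoInit():
    model = AutoModelForCausalLM.from_pretrained('internlm/internlm-chat-7b',
                                                 torch_dtype=torch.float16)
```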
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import torch.nn as nn
from .base import BasicAdapter, BasicAdapterFast
from .internlm import InternLMAdapter
from .llama2 import Llama2Adapter
logger = logging.getLogger(__name__)
def _get_default_adapter(tokenizer):
if tokenizer.is_fast:
return BasicAdapterFast
else:
return BasicAdapter
def init_adapter(model: nn.Module, tokenizer, adapter=None):
if adapter is None:
for v in model.modules():
if 'InternLMModel' in v.__class__.__name__:
Adapter = InternLMAdapter
break
elif 'LlamaModel' in v.__class__.__name__:
Adapter = Llama2Adapter
break
else:
Adapter = _get_default_adapter(tokenizer)
elif adapter == 'llama1':
Adapter = _get_default_adapter(tokenizer)
else:
raise ValueError(f'Adapter {adapter} is not allowed.')
logger.info(f'Using adapter {Adapter.__name__}')
return Adapter(tokenizer)
# Copyright (c) OpenMMLab. All rights reserved.
"""Basic adapter suitable for general HuggingFace models."""
import logging
import re
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerBase,
PreTrainedTokenizerFast)
logger = logging.getLogger(__name__)
class BaseAdapter:
"""Base class for all adapters.
Note:
Adapters coordinate with the session manager to prepare input_ids.
The full sequence fed to the model is as follows:
```
adapter.start_ids
adapter.encode_and_decorate(user_input_1)
output_1_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_2)
output_2_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_3)
```
        Thus the adapter is responsible for providing the model-specific
        ``start_ids``, ``sep_ids``, and a method to encode a single prompt.
"""
def __init__(self, tokenizer: PreTrainedTokenizerBase):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Model specific method to encode and decorate prompt."""
raise NotImplementedError
def decode(self, value):
"""Model specific method to decode single value to string."""
raise NotImplementedError
@property
def stopping_criteria(self):
"""Model specific stopping criteria for generation."""
return None
@property
def start_ids(self):
"""Model specific start ids."""
return [self.tokenizer.bos_token_id]
@property
def sep_ids(self):
"""Model specific separation ids."""
return [self.tokenizer.bos_token_id]
class BasicAdapter(BaseAdapter):
"""Basic adapter for slow tokenizers."""
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> to session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Fallback when tokenizer is not fast."""
self.tokenizer: PreTrainedTokenizer
tok = self.tokenizer.decode(value)
return tok + ' '
class BasicAdapterFast(BaseAdapter):
"""Basic adapter for slow tokenizers."""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> to session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Decode with fast tokenizers."""
self.tokenizer: PreTrainedTokenizerFast
tok = self.tokenizer._convert_id_to_token(value)
if tok.startswith('▁'): # sentencepiece
space = ' '
tok = tok[1:]
else:
space = ''
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '\r':
tok = '\n'
tok = space + tok
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
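A hedged illustration of the decode rules above, assuming a sentencepiece-backed fast tokenizer; the checkpoint path is a placeholder:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('path/to/llama-style-model', use_fast=True)
adapter = BasicAdapterFast(tok)
adapter.decode(tok.convert_tokens_to_ids('▁world'))   # -> ' world' (leading space restored)
adapter.decode(tok.convert_tokens_to_ids('<0x0A>'))   # -> '\n'     (raw-byte token 0x0A)
```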
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
import torch
from transformers import (PreTrainedTokenizerFast, StoppingCriteria,
StoppingCriteriaList)
from .base import BaseAdapter
logger = logging.getLogger(__name__)
class InternLMStoppingCriteria(StoppingCriteria):
"""Stopping criteria for HF version of InternLM."""
def __call__(self, input_ids, *args, **kwargs) -> bool:
return input_ids[0, -1] in [2, 103028]
class InternLMAdapter(BaseAdapter):
"""Adapter for InternLM.
    InternLM uses the following template, and \n should be token id 13.
<bos> (no actual newline here, just for better readability)
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
...
<eos>
"""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
# ids of '<|User|>:'
B_USER_ID = torch.tensor([[333, 352, 1621, 352, 27232]])
# ids of '<eoh>\n<|Bot|>:'
E_USER_ID = torch.tensor([[103027, 13, 333, 352, 23845, 352, 27232]])
# ids of '<bos>'
start_ids = [1]
# ids of '\n'
sep_ids = [13]
def __init__(self, tokenizer: PreTrainedTokenizerFast):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template.
Note:
we leave <bos> and chat history for session manager to add,
so we will decorate input_ids to '<|User|>:{prompt}<eoh>\n<|Bot|>:'
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=False,
return_tensors='pt',
)
# This is f'<|User|>:{prompt}<eoh>\n<|Bot|>:'
# but force \n to 13 instead of 364
input_ids = torch.cat([self.B_USER_ID, input_ids, self.E_USER_ID],
dim=1)
return input_ids
def decode(self, value):
"""Decode generated tokens for InternLM."""
tok = self.tokenizer.decode(value)
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '<eoa>' or tok == '\r':
tok = '\n'
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
@property
def stopping_criteria(self):
return StoppingCriteriaList([InternLMStoppingCriteria()])
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
from transformers import PreTrainedTokenizerFast
from .base import BasicAdapterFast
logger = logging.getLogger(__name__)
B_INST, E_INST = '[INST]', '[/INST]'
B_SYS, E_SYS = '<<SYS>>\n', '\n<</SYS>>\n\n'
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" # noqa: E501
class Llama2Adapter(BasicAdapterFast):
"""Adapter for llama2.
    Llama2 uses the following template, and the first user prompt
    should contain a system prompt.
    The user can specify the system prompt with a <<SYS>> tag; otherwise
    the default system prompt is prepended to the user's input.
<bos>
[INST]<space>
<<SYS>>\n
SYSTEM_PROMPT\n
<</SYS>>\n\n
{user_prompt_1}<space>
[/INST]<space>
{answer_1}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}<space>
[/INST]<space>
{answer_2}<space>
<eos>
<bos>
[INST]<space>
    {user_prompt_3}(no space here)
...
"""
start_ids = []
sep_ids = []
def __init__(self, tokenizer: PreTrainedTokenizerFast):
super().__init__(tokenizer)
self.prev_round = 0
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template."""
if self.prev_round == 0:
res = re.search(r'<<SYS>>(.*?)<</SYS>>(.*)', prompt)
if res:
prompt = B_SYS + res.group(1).strip() + \
E_SYS + res.group(2).strip()
else:
prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + prompt
prompt = f'{B_INST} {prompt.strip()} {E_INST}'
logger.debug(f'decorated prompt: {repr(prompt)}')
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=True,
return_tensors='pt',
)
self.prev_round += 1
return input_ids
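A hedged sketch of the first-round decoration described in the docstring; the tokenizer path is a placeholder for a Llama-2 style fast tokenizer:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('path/to/llama-2-model', use_fast=True)
adapter = Llama2Adapter(tok)
ids = adapter.encode_and_decorate('<<SYS>>You are a pirate.<</SYS>>Ahoy!')
# The decorated text that gets encoded is
# '[INST] <<SYS>>\nYou are a pirate.\n<</SYS>>\n\nAhoy! [/INST]';
# later rounds skip the system block and become '[INST] {prompt} [/INST]'.
```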
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
"""Chat through command line.
import os
This submodule allows the user to chat with a language model through the command line, import random
and optionally accelerate the model using backends like deepspeed. from typing import List
Example 1: Chat with default setting from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig
from lmdeploy.model import MODELS, best_match_model
```python from lmdeploy.tokenizer import DetokenizeState, Tokenizer
python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
``` os.environ['TM_LOG_LEVEL'] = 'ERROR'
Example 2: Disable sampling
def input_prompt(model_name):
```python """Input a prompt in the console interface."""
python -m lmdeploy.pytorch.chat \ if model_name == 'codellama':
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ print('\nenter !! to end the input >>>\n', end='')
--temperature 0 sentinel = '!!'
``` else:
print('\ndouble enter to end input >>> ', end='')
Example 3: Accelerate with deepspeed inference sentinel = '' # ends when this string is seen
return '\n'.join(iter(input, sentinel))
```python
python -m lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ def valid_str(string, coding='utf-8'):
--accel deepspeed """decode text according to its encoding type."""
``` invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
Note: to use deepspeed, you need to install deepspeed, for invalid_char in invalid_chars:
and if you hope to accelerate InternLM, you need a customized version bstr = bstr.replace(invalid_char, b'')
https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0 ret = bstr.decode(encoding=coding, errors='ignore')
return ret
Example 4: Tensor parallel the model on 2 GPUs
```python def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \ """Return a list of token ids corresponding to stop-words."""
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ if stop_words is None:
--accel deepspeed \ return None
``` assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
This module also allow the following control commands to change f'stop_words must be a list but got {type(stop_words)}'
generation behaviors during chat. stop_words = [
tokenizer.encode(stop_word, False)[-1] for stop_word in stop_words
- `exit`: terminate and exit chat ]
- `config set key=value`: change generation config `key` to `value`, assert isinstance(stop_words, List) and all(
e.g. config temperature=0 disable sampling for following chats isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
- `clear`: clear chat history return stop_words
"""
import itertools def run_chat(model_path: str,
import logging engine_config: PytorchEngineConfig,
from typing import Optional gen_config: EngineGenerationConfig = None,
session_id: int = 1,
import torch trust_remote_code: bool = True):
from transformers import GenerationConfig, PreTrainedModel """An example to perform model inference through the command line
interface.
from .adapters import init_adapter
from .dist import get_local_rank, get_rank, get_world_size
from .model import accel_model, init_model
from .session import BasicSessionManagerWithHistory
from .utils import BasicStreamer, TerminalIO, control
logger = logging.getLogger(__name__)
def set_logging(log_file: str, debug: bool):
torch.set_printoptions(linewidth=120)
level = logging.DEBUG if debug else logging.INFO
log_file = log_file or 'chat.log'
if r := get_rank() != 0:
log_file = log_file + f'.{r}'
logging.basicConfig(level=level,
format=('%(filename)s: '
'%(levelname)s: '
'%(funcName)s(): '
'%(lineno)d:\t'
'%(message)s'),
filename=log_file,
filemode='w')
print(f'Worker {get_rank()} logging to {log_file}')
def main(
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None,
):
"""Chat with model through terminal.
Args: Args:
model_path (str): Path to model. model_path (str): the huggingface model path.
tokenizer_path (str): Path to tokenizer. engine_config (PytorchEngineConfig): Config of engine.
accel (str): Model accelerator. gen_config (EngineGenerationConfig): Config of generation.
max_new_tokens (int): Maximum number of tokens to generate. session_id (int): the identical id of a session.
temperature (float): Temperature for sampling. trust_remote_code (bool): trust remote code.
top_p (float): Top p for sampling. """
seed (int): Random seed. from lmdeploy.pytorch.engine import Engine
use_fast_tokenizer (bool): Whether to use fast tokenizer. tm_model = Engine.from_pretrained(model_path,
This argument is directly pass to transformer's ``AutoTokenizer.from_pretrained``. engine_config=engine_config,
Generally, user should choose to use fast tokenizers. trust_remote_code=trust_remote_code)
But if using fast raise some error, try to force using a slow one. tokenizer = tm_model.tokenizer
max_alloc (int): Maximum memory to allocate (for deepspeed). generator = tm_model.create_instance()
max_session_len (int): Maximum number of tokens allowed for all chat sessions. adapter_name = None
This include both history and current session. if engine_config.adapters is not None:
log_file (str): Path to log file. adapter_name = next(iter(engine_config.adapters.keys()))
debug (bool): Whether to enable debug mode.
adapter (str): Force to use an adapter. if gen_config is None:
Generally user should not use this argument because adapter is selected based gen_config = EngineGenerationConfig()
on the type of model. Only when it is impossible, e.g. distinguishing llama 1/2
based on `LlamaforCausalLM` class, this argument is required. nth_round = 1
Currently, only "llama1" is acceptable for llama1 models. step = 0
""" # noqa: E501 seed = random.getrandbits(64)
set_logging(log_file, debug) model_name = engine_config.model_name
if model_name is None:
# workers should sync in sampling model_name = best_match_model(model_path)
torch.manual_seed(seed) assert model_name is not None, 'Can not find match model template'
print(f'match template: <{model_name}>')
local_rank = get_local_rank() model = MODELS.get(model_name)()
world_size = get_world_size() stop_words = _stop_words(model.stop_words, tokenizer)
# Init model and tokenizer while True:
if not tokenizer_path: prompt = input_prompt(model_name)
tokenizer_path = model_path if prompt == 'exit':
exit(0)
model, tokenizer = init_model( elif prompt == 'end':
model_path, generator.end(session_id)
tokenizer_path, nth_round = 1
use_fast_tokenizer=use_fast_tokenizer, step = 0
) seed = random.getrandbits(64)
else:
# Init adapter based on model and tokenizer prompt = model.get_prompt(prompt, nth_round == 1)
adapter = init_adapter(model, tokenizer, adapter) input_ids = tokenizer.encode(prompt, nth_round == 1)
session_len = model.session_len
# Accelerate model if session_len is None:
model: PreTrainedModel = accel_model(model, session_len = tm_model.session_len
accel, if step >= session_len:
max_alloc=max_alloc, print('WARNING: exceed session max length.'
tp_size=world_size) ' Please end the session.')
continue
# warmup
warmup_config = GenerationConfig( print(f'{prompt} ', end='', flush=True)
max_new_tokens=1, state = DetokenizeState()
do_sample=temperature > 0, gen_config.random_seed = seed
temperature=temperature, gen_config.stop_words = stop_words
top_p=top_p, for outputs in generator.stream_infer(session_id=session_id,
) input_ids=input_ids,
model.generate(torch.tensor([[6]], device=get_local_rank()), warmup_config) gen_config=gen_config,
adapter_name=adapter_name):
gen_config = GenerationConfig( status, res, tokens = outputs
max_new_tokens=max_new_tokens, # decode res
do_sample=temperature > 0, response, state = tokenizer.detokenize_incrementally(
temperature=temperature, res, state)
top_p=top_p, response = valid_str(response)
) print(f'{response}', end='', flush=True)
# Session manager handling history # update step
max_session_len = max_alloc if max_session_len is None else max_session_len step += len(input_ids) + tokens
sm = BasicSessionManagerWithHistory(max_session_len=max_session_len, print()
start_ids=adapter.start_ids,
sep_ids=adapter.sep_ids) nth_round += 1
io = TerminalIO()
streamer = BasicStreamer(adapter.decode, io.output)
def main(model_path: str,
for r in itertools.count(1): model_name: str = None,
# User input from IO session_id: int = 1,
logger.info(f'Round {r}') top_k: float = 40,
top_p: float = 0.8,
prompt: str = io.input() temperature: float = 0.8,
logger.info(f'User input: {prompt}') repetition_penalty: float = 1.0,
tp: int = 1,
# Allow user to change config during runtime or exit stream_output: bool = True,
if control(prompt, gen_config, sm): adapter: str = None,
continue trust_remote_code: bool = True):
"""An example to perform model inference through the command line
# Tokenize and apply model specific templates interface.
input_ids = adapter.encode_and_decorate(prompt)
logger.info(f'Input ids:\n{input_ids}')
# Prepend chat history (tensor concatenation)
input_ids = sm.prepend_history(input_ids)
logger.info(f'Input ids with history:\n{input_ids}')
# Generate
input_ids = input_ids.cuda(local_rank)
# returned tensor including input and generated output
output = model.generate(input_ids,
gen_config,
streamer=streamer,
stopping_criteria=adapter.stopping_criteria)
logger.info(f'Output:\n{output}')
# Save output into session manager and maybe trim some history
sm.add_to_history(output)
def cli():
import fire
fire.Fire(main) Args:
model_path (str): the huggingface model path
model_name (str): name of the model.
session_id (int): the identical id of a session
top_k (int): sampling top k.
top_p (int): sampling top p.
temperature (float): sampling temperature.
repetition_penalty (float): parameter to penalize repetition
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
adapter (str): path to lora adapter.
trust_remote_code (bool): Trust remote code.
"""
adapters = None
if adapter is not None:
adapters = dict(default=adapter)
engine_config = PytorchEngineConfig(model_name=model_name,
tp=tp,
adapters=adapters)
gen_config = EngineGenerationConfig(max_new_tokens=512,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=False)
return run_chat(model_path,
engine_config,
gen_config,
session_id=session_id,
trust_remote_code=trust_remote_code)
if __name__ == '__main__': if __name__ == '__main__':
cli() import fire
fire.Fire(main)
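# A minimal sketch (not part of this commit) of driving the PyTorch-engine
# chat loop above programmatically instead of through `fire`. The model path
# is a placeholder and `_chat_example` is a hypothetical helper name.
from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig
from lmdeploy.pytorch.chat import run_chat


def _chat_example(model_path: str = 'internlm/internlm-chat-7b'):
    # build engine/generation configs and start the interactive loop
    engine_config = PytorchEngineConfig(tp=1)
    gen_config = EngineGenerationConfig(max_new_tokens=256,
                                        top_k=40,
                                        top_p=0.8,
                                        temperature=0.7)
    run_chat(model_path, engine_config, gen_config=gen_config, session_id=1)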
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import logging
import queue
import warnings
from typing import List, Optional
import pynvml
import torch
import torch.multiprocessing as mp
from torch.nn.utils.rnn import pad_sequence
from transformers import (AutoTokenizer, PreTrainedModel,
PreTrainedTokenizerBase)
from .model import accel_model, init_model
def safe_numel(free_mem, model_size, max_intermediate):
"""Number of elements without out-of-memory."""
return int(free_mem - model_size) // max_intermediate
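# Worked example (illustrative numbers, not part of the original module):
# with ~40 GiB free, a 14e9-byte model and ~2e6 bytes of activations per
# token, safe_numel(40 * 2**30, 14e9, 2e6) is about 14474, i.e. roughly 14k
# input elements (batch_size * seq_len) fit in a single forward pass.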
def avail_gpus(percentage=0.96):
"""Detect available gpus.
Args:
percentage (float): The minimum fraction of free memory for a GPU to be
considered available.
Return:
A list of available gpu ids.
The average free memory (in bytes) across those gpus.
"""
gpus = []
mems = []
pynvml.nvmlInit()
for i in range(torch.cuda.device_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(int(i))
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
free, total = int(mem_info.free), int(mem_info.total)
if free / total > percentage:
gpus.append(i)
mems.append(free)
pynvml.nvmlShutdown()
if len(gpus) == 0:
raise RuntimeError('No GPU available.')
return gpus, sum(mems) / len(mems)
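# Usage sketch (illustrative values, not from this commit): on a node where
# GPUs 0 and 1 are idle, avail_gpus() would return something like
#     ([0, 1], 4.2e10)    # ids of free GPUs, average free bytes per GPU
# and it raises RuntimeError when no GPU has enough free memory.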
@torch.no_grad()
def decode_single(model: PreTrainedModel,
input_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
return_logits=True):
"""Decode a single batch.
Args:
model (PreTrainedModel): Pretrained model.
input_ids (torch.Tensor): A batch of input ids.
attention_mask (torch.Tensor): A batch of attention masks.
return_logits (bool): Return raw logits instead of probabilities.
Returns:
torch.Tensor: A batch of per-token logits or probabilities (on CPU).
Note:
This function assumes input_ids[i] = [bos, x1, x2, ..., xn]
and returns prob = [p(x1|bos), p(x2|bos,x1), ..., p(xn|bos..xn-1)],
so prob is shorter than input_ids by 1.
"""
# Call Causal LM forward
outputs = model(input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=False,
output_attentions=False,
use_cache=False,
return_dict=True)
# fp32, [bs, seq_len, vocab_size]
logits = outputs.logits
if not return_logits:
# inplace softmax to get probs
torch.softmax(logits, dim=-1, out=logits)
# Shift to fetch probabilities
shift_labels = input_ids[..., 1:].contiguous()
shift_probs = logits[..., :-1, :].contiguous()
logits = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
if attention_mask is not None:
logits *= attention_mask[..., None]
logits = logits.cpu()
return logits
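# Illustration (not from this commit) of the shift-and-gather step used in
# decode_single above: the score of token x_t is read from the distribution
# predicted at position t-1, so the result is one step shorter than the input.
# Shapes are toy values.
import torch

example_input_ids = torch.tensor([[1, 5, 2, 7]])   # [bs=1, seq_len=4]
example_logits = torch.randn(1, 4, 10)             # [bs, seq_len, vocab=10]
example_probs = torch.softmax(example_logits, dim=-1)
shift_labels = example_input_ids[..., 1:]          # targets x_1 .. x_3
shift_probs = example_probs[..., :-1, :]           # predictions at positions 0..2
per_token = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
assert per_token.shape == (1, 3, 1)                # one shorter than the input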
def worker_fn(model_path: str,
inq: mp.Queue,
outq: mp.Queue,
accel: Optional[str] = None,
gpu_id=0):
# torch.set_default_device(gpu_id)
model, _ = init_model(model_path)
model = model.eval()
model = accel_model(model, accel, gpu_id=gpu_id)
while True:
try:
idx, args = inq.get(timeout=1)
except queue.Empty:
continue
if idx is None:
print(f'Worker {gpu_id} received exit signal.')
break
# print(args)
input_ids, input_lens, *args = args
input_ids = input_ids.cuda(gpu_id)
max_len = max(input_lens)
assert max_len == input_ids.size(-1), \
f'input_ids.shape = {input_ids.shape}, max_len = {max_len}'
input_lens = torch.tensor(input_lens, device=gpu_id)
attention_mask = \
torch.arange(max_len, device=gpu_id)[None, :] < input_lens[:, None]
assert attention_mask.shape == input_ids.shape, \
f'attention_mask.shape = {attention_mask.shape}'
try:
probs = decode_single(model, input_ids, attention_mask, *args)
except torch.cuda.OutOfMemoryError:
warnings.warn(
f'OOM on GPU {gpu_id}, discard prompts at indices {idx}.')
probs = torch.empty((input_ids.size(0), 0),
dtype=torch.float32,
device='cpu')
outq.put((idx, probs))
print(f'Exiting worker {gpu_id} ...')
inq.close()
outq.close()
print(f'Worker {gpu_id} finished.')
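# Queue protocol implemented by worker_fn above (descriptive comment, not code
# from this commit): the parent process puts work items
#     (batch_indices, (input_ids, input_lens))
# on the input queue, plus one (None, None) sentinel per worker to request
# shutdown; each worker replies on the output queue with
#     (batch_indices, per_token_scores_on_cpu)
# so the Engine below can restore the original prompt order.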
class Engine:
"""Multi-GPU deciding engine.
Args:
model_path (str): Path to the pretrained model.
tokenizer_path (str, optional): Path to the pretrained tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
tokenizer (PreTrainedTokenizerBase, optional): Pre-configured tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
accel (str, optional): Acceleration method.
Defaults to None. 'deepspeed' is not tested.
gpu_mem_percentage (float, optional): GPUs whose free-memory fraction is
larger than this value are considered available and used as decode devices.
Defaults to 0.96.
model_size_byte (float, optional): (Approximate) model size in bytes.
Defaults to 14e9 (7B model in FP16).
bytes_per_token (float, optional): (Approximate) memory cost per token in bytes.
Defaults to 2e6 (2MB).
``bytes_per_token`` and ``model_size_byte`` are used to compute
the maximum batch size for given seq_length
""" # noqa: E501
def __init__(self,
model_path: str,
tokenizer_path: Optional[str] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
accel: Optional[str] = None,
gpu_mem_percentage: float = 0.96,
model_size_byte=14e9,
bytes_per_token=2e6):
gpu_ids, mem = avail_gpus(gpu_mem_percentage)
print(f'Available GPUs are: {gpu_ids}, ', end='')
print(f'with {mem/2**30:.2f} GiB free.')
ctx = mp.get_context('spawn')
inq = ctx.Queue()
outq = ctx.Queue()
ps = []
for id in gpu_ids:
p = ctx.Process(target=worker_fn,
args=(model_path, inq, outq, accel, id))
p.start()
ps.append(p)
if tokenizer is None:
if tokenizer_path is None:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
self.gpu_ids = gpu_ids
self.inq = inq
self.outq = outq
self.ps = ps
self.tokenizer = tokenizer
self.safe_numel = safe_numel(mem, model_size_byte, bytes_per_token)
def clear_queue(self):
for q in self.inq, self.outq:
while not q.empty():
q.get()
def decode(self,
token_ids: List[List[int]],
sort=True,
max_bs: int = 1024,
pad=True,
pad_token_id=2,
return_logits=True):
"""Inference the model to compute probabilities.
Args:
token_ids (List[List[int]]): List of list of token ids.
sort (bool, optional): Internally sort the prompts by length to achieve better efficiency.
Defaults to True.
Note: orders of returned probabilities are always the same as the input.
max_bs (int, optional): Maximum batch size.
Defaults to 1024.
pad (bool, optional): Pad the prompts in every mini batch to the same length.
Defaults to True. Set to False to save memory.
return_logits (bool, optional): Return logits instead of probabilities.
Returns:
numpy.ndarray: Array of per-token scores of shape [bsz, seqlen-1, 1],
zero-padded, if pad is True
List[numpy.ndarray]: List of per-token scores without padding, if pad is False.
Note:
This function will accept input token_ids = [x0(=bos), x1, x2, ..., xn]
and compute prob = [p(x1|x0), p(x2|x0,x1), ..., p(xn|x0..xn-1)]
So prob is shorter than input_ids by 1.
""" # noqa: E501
self.clear_queue()
# sort to achieve better efficiency
if sort:
pids_and_indicis = sorted(enumerate(token_ids),
key=lambda i_and_x: len(i_and_x[1]))
else:
pids_and_indicis = list(enumerate(token_ids))
left = 0
bs = max_bs
while left < len(token_ids):
if not sort:
bs = max_bs
right = min(left + bs, len(token_ids))
# batch of prompts
sub_p_and_i = pids_and_indicis[left:right]
idx, sub_p = zip(*sub_p_and_i)
# batch of input_ids and attn_masks
# inputs = self.tokenizer(sub_p, return_tensors='pt', padding=True)
input_ids = [torch.tensor(p) for p in sub_p]
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=pad_token_id)
input_lens = [len(p) for p in sub_p]
# Dynamic batch size based on safe memory
while input_ids.numel() > self.safe_numel:
if bs == 1:
break
bs = max(1, round(bs / 1.5))
print(f'\nReduce bs to {bs} when seq len reaches '
f'{input_ids.shape[-1]}')
idx = idx[:bs]
input_lens = input_lens[:bs]
input_ids = input_ids[:bs, :max(input_lens)]
# Send to worker
self.inq.put((idx, (input_ids, input_lens)))
left += bs
print(
f'Distributing prompts {right}/{len(token_ids)},'
f' {right/len(token_ids):.0%}',
end='\r')
print()
# Collect outputs from workers
all_probs = [None] * len(token_ids)
count = 0
while count < len(token_ids):
idx, probs = self.outq.get()
for i, p in zip(idx, probs):
assert all_probs[i] is None
all_probs[i] = p
count += len(idx)
print(
f'Decoding and collecting outputs '
f'{count}/{len(token_ids)}, '
f'{count/len(token_ids):.0%}',
end='\r')
print()
if pad:
all_probs = pad_sequence(all_probs, batch_first=True)
all_probs = all_probs.cpu().numpy()
else:
all_probs = [p.cpu().numpy() for p in all_probs]
return all_probs
def __del__(self):
print('Exiting engine ...')
for _ in self.ps:
self.inq.put((None, None))
for p in self.ps:
p.join(timeout=1)
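# Minimal usage sketch (illustrative; the path and token ids are placeholders,
# and the __main__ block below is the full example):
#     engine = Engine('llama2/huggingface/llama-2-7b')
#     scores = engine.decode([[1, 306, 4658], [1, 3439, 17632]])
#     # scores.shape == (2, 2, 1): one score per token after the first
#     del engine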
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_path',
default='llama2/huggingface/llama-2-7b',
help='Path to HuggingFace model and tokenizer.')
parser.add_argument(
'--test_path',
default='',
help='Path to text file, with each line containing a prompt.')
parser.add_argument(
'-p',
'--prompts',
nargs='*',
default=[
'I believe the meaning of life is to find your gift.',
'Simply put, the theory of relativity states that',
'Building a website can be done in 10 simple steps:'
],
help="Prompt in command line, please quote \"\" every sentences, "
'surpassed by --test_path')
parser.add_argument('--min_len',
default=1,
help='Minimum length of prompts')
parser.add_argument('--save-to',
default='decode.out',
help='Save results to this file.')
args = parser.parse_args()
model_path = args.model_path
test_path = args.test_path
prompts = args.prompts
logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.DEBUG)
# Use test file preferentially
if test_path:
with open(test_path, 'r') as f:
prompts = f.readlines()
prompts = [p.strip() for p in prompts]
# Output infos
print(f'Model path: {model_path}')
def _format(ts, start, end):
if start < 0:
start += len(ts)
if end <= 0:
end += len(ts)
return '\n'.join(
(f'{i}\t{t}' for i, t in zip(range(start, end), ts[start:end])))
if len(prompts) > 10:
print('Prompts:\n' + _format(prompts, 0, 5) + '\n......\n' +
_format(prompts, -5, 0))
else:
print('Prompts:\n' + _format(prompts, 0, 0))
# Init Engine in backend
engine = Engine(model_path)
# Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
input_ids = tokenizer(prompts, padding=False)
input_ids: List[List[int]] = input_ids.input_ids
# Filter out too short prompts
input_ids = [i for i in input_ids if len(i) >= args.min_len]
if len(input_ids) < len(prompts):
logger.warning(
f'Filtered out {len(prompts) - len(input_ids)} prompts, '
f'because they are shorter than {args.min_len}.')
# Decode
logits = engine.decode(input_ids)
print(f'logits.shape = {logits.shape}')
# Save to pth
print(f'Dumping results to = {args.save_to}')
torch.save(logits, args.save_to, pickle_protocol=4)
del engine