Commit d7117b95 authored by zhouxiang

Sync 0.2.6 code

parent 5f83e392
# Copyright (c) OpenMMLab. All rights reserved.
from .turbomind import TurboMind
def bootstrap():
import os
import sys
has_turbomind = False
pwd = os.path.dirname(__file__)
if os.path.exists(os.path.join(pwd, 'lib')):
has_turbomind = True
if os.name == 'nt' and has_turbomind:
if sys.version_info[:2] >= (3, 8):
CUDA_PATH = os.getenv('CUDA_PATH')
assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH'
dll_path = os.path.join(CUDA_PATH, 'bin')
print(f'Add dll path {dll_path}, please note cuda version '
'should be >= 11.3 when compiled with cuda 11')
os.add_dll_directory(dll_path)
bootstrap()
from .turbomind import TurboMind # noqa: E402
__all__ = ['TurboMind']
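For reference, a minimal sketch of how the Windows bootstrap above is exercised; the CUDA install path below is only a placeholder, and CUDA_PATH must point at the toolkit before the package is imported.

import os

# Sketch only: set CUDA_PATH before importing, so bootstrap() can register the
# CUDA bin directory via os.add_dll_directory() on Windows (Python >= 3.8).
os.environ.setdefault(
    'CUDA_PATH',
    r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8')  # placeholder path

from lmdeploy.turbomind import TurboMind  # importing the package runs bootstrap()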
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
import random
from lmdeploy.turbomind.utils import get_gen_param
from lmdeploy.messages import EngineGenerationConfig
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.tokenizer import DetokenizeState
os.environ['TM_LOG_LEVEL'] = 'ERROR'
......@@ -29,32 +30,51 @@ def valid_str(string, coding='utf-8'):
return ret
def main(model_path,
def main(model_path: str,
model_name: str = None,
session_id: int = 1,
cap: str = 'chat',
tp: int = 1,
stream_output: bool = True,
request_output_len: int = 512,
request_output_len: int = 1024,
chat_template_cfg: ChatTemplateConfig = None,
**kwargs):
"""An example to perform model inference through the command line
interface.
Args:
model_path (str): the path of the deployed model
model_name (str): the name of deployed model
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infilling', 'chat', 'python']
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
request_output_len (int): output token nums
chat_template_cfg (ChatTemplateConfig): Chat template config
**kwargs (dict): other arguments for initializing the model's chat template
"""
from lmdeploy import turbomind as tm
tm_model = tm.TurboMind.from_pretrained(model_path,
tp=tp,
capability=cap,
**kwargs)
if chat_template_cfg is None:
chat_template_cfg = ChatTemplateConfig(model_name=model_name,
capability=cap)
new_kwargs = {}
for k, v in kwargs.items():
if hasattr(chat_template_cfg, k):
setattr(chat_template_cfg, k, v)
else:
new_kwargs[k] = v
kwargs = new_kwargs
tm_model = tm.TurboMind.from_pretrained(
model_path,
model_name=model_name,
tp=tp,
capability=cap,
chat_template_config=chat_template_cfg,
**kwargs)
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()
gen_config = EngineGenerationConfig(top_k=40)
nth_round = 1
step = 0
......@@ -90,29 +110,30 @@ def main(model_path,
' Please end the session.')
continue
gen_param = get_gen_param(cap, model.sampling_param, nth_round,
step, request_output_len, **kwargs)
sequence_start = (nth_round == 1)
sequence_end = False
if cap != 'chat': # not interactive for other capability
sequence_start, sequence_end = True, True
step = 0
print(f'{prompt} ', end='', flush=True)
response_size = 0
state = DetokenizeState()
for outputs in generator.stream_infer(
session_id=session_id,
input_ids=[input_ids],
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stream_output=stream_output,
**dataclasses.asdict(gen_param),
gen_config=gen_config,
ignore_eos=False,
random_seed=seed if nth_round == 1 else None):
res, tokens = outputs[0]
_, res, tokens = outputs
# decode res
response = tokenizer.decode(res.tolist(), offset=response_size)
# utf-8 char at the end means it's a potential unfinished
# byte sequence, continue to concatenate it with the next
# sequence and decode them together
if response.endswith('�'):
continue
response, state = tokenizer.detokenize_incrementally(
res, state=state)
response = valid_str(response)
print(f'{response}', end='', flush=True)
response_size = tokens
# update step
step += len(input_ids) + tokens
......
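A hedged usage sketch for the interactive main() above, assuming the excerpt lives in lmdeploy/turbomind/chat.py; the model id is a placeholder.

# Sketch: start the turbomind chat loop programmatically. Any model path accepted
# by TurboMind.from_pretrained should work in place of the hub id below.
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.turbomind.chat import main

main('internlm/internlm-chat-7b',
     tp=1,
     stream_output=True,
     request_output_len=1024,
     chat_template_cfg=ChatTemplateConfig(model_name='internlm-chat-7b'))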
......@@ -7,10 +7,9 @@ from pathlib import Path
import fire
import torch
from huggingface_hub import snapshot_download
from lmdeploy.model import MODELS
from lmdeploy.turbomind.utils import create_hf_download_args
from lmdeploy.utils import get_model
from .source_model.base import INPUT_MODELS
from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
......@@ -19,7 +18,8 @@ supported_formats = ['llama', 'hf', 'awq', None]
special_input_model_map = {
'qwen': 'qwen',
'baichuan': 'baichuan',
'baichuan2': 'baichuan2'
'baichuan2': 'baichuan2',
'internlm2': 'internlm2'
}
......@@ -241,8 +241,7 @@ def main(model_name: str,
if not os.path.exists(model_path):
print(f'can\'t find model from local_path {model_path}, '
'try to download from huggingface')
download_kwargs = create_hf_download_args(**kwargs)
model_path = snapshot_download(model_path, **download_kwargs)
model_path = get_model(model_path)
print(f'load model from {model_path}')
# get tokenizer path
......
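The converter's main() (partially shown above) can also be driven directly. Only model_name and model_path appear in the excerpt; the remaining keyword names (dst_path, tp) are assumptions about the rest of its signature.

# Sketch: convert a HF checkpoint into a turbomind workspace.
from lmdeploy.turbomind.deploy.converter import main as convert

convert('internlm2-chat-7b',           # a model name registered in lmdeploy
        '/path/to/internlm2-chat-7b',  # local HF dir or hub model id
        dst_path='./workspace',        # assumed keyword
        tp=1)                          # assumed keyword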
# Copyright (c) OpenMMLab. All rights reserved.
from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401
from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
from .internlm2 import InternLM2AwqModel, InternLM2Model # noqa: F401
from .llama import LlamaModel # noqa: F401
from .llama_awq import LlamaAwqModel # noqa: F401
from .meta_llama import MetaLlamaModel # noqa: F401
......
......@@ -9,8 +9,9 @@ from .llama import LlamaModel, LlamaReader
class BaichuanReader(LlamaReader):
"""BaichuanReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
......@@ -34,8 +35,9 @@ class BaichuanReader(LlamaReader):
class Baichuan2Reader(BaichuanReader):
"""Baichuan2Reader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def output_weight(self):
"""Get output."""
......
......@@ -9,8 +9,9 @@ from .llama_awq import ensure_fp16orint32
class BaichuanAwqReader(BaichuanReader):
"""BaichuanAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
......@@ -40,8 +41,9 @@ class BaichuanAwqReader(BaichuanReader):
class Baichuan2AwqReader(BaichuanAwqReader):
"""Baichuan2AwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def output_weight(self):
"""Get output."""
......
......@@ -2,6 +2,7 @@
import json
import os
import os.path as osp
from glob import glob
import torch
from safetensors.torch import load_file
......@@ -19,11 +20,13 @@ class LlamaReader(BaseReader):
norm_weight_key = 'model.norm.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__()
self.params = unused_params
self.params.update(new_params)
self.last_bin = last_bin
self.model_cfg = model_cfg
self.init_layer_id()
def init_layer_id(self):
......@@ -128,13 +131,11 @@ class LlamaModel(BaseInputModel):
def get_ckpt(self):
"""Get weight files."""
suffixes = ['.safetensors', '.bin']
patterns = ['*.safetensors', 'pytorch_model*.bin']
files = []
for suffix in suffixes:
files = [
file for file in os.listdir(self.ckpt_path)
if file.endswith(suffix)
]
for pattern in patterns:
files = glob(os.path.join(self.ckpt_path, pattern))
files = [os.path.basename(file) for file in files]
if len(files) > 0:
break
files = sorted(files)
......@@ -159,7 +160,7 @@ class LlamaModel(BaseInputModel):
else:
new_params = load_file(osp.join(self.ckpt_path, ckpt))
ret = self.Reader(new_params, unused_params,
i == self.nmgrs - 1)
i == self.nmgrs - 1, self.model_info())
yield ret
ret.clean_up(is_last_bin)
except GeneratorExit:
......@@ -181,6 +182,7 @@ class LlamaModel(BaseInputModel):
model_arg = json.load(f)
num_layer = model_arg['num_hidden_layers']
norm_eps = model_arg['rms_norm_eps']
attn_head_num = model_arg['num_attention_heads']
if 'num_key_value_heads' in model_arg:
kv_head_num = model_arg['num_key_value_heads']
else:
......@@ -192,6 +194,7 @@ class LlamaModel(BaseInputModel):
return dict(num_layer=num_layer,
norm_eps=norm_eps,
attn_head_num=attn_head_num,
kv_head_num=kv_head_num,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
......
......@@ -23,8 +23,9 @@ def ensure_fp16orint32(tensors: torch.Tensor):
class LlamaAwqReader(LlamaReader):
"""LlamaAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
......
......@@ -16,8 +16,9 @@ class QwenReader(LlamaReader):
norm_weight_key = 'transformer.ln_f.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
......
......@@ -7,8 +7,9 @@ from .qwen import QwenModel, QwenReader
class QwenAwqReader(QwenReader):
"""QwenAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
......
# Copyright (c) OpenMMLab. All rights reserved.
import configparser
import copy
import inspect
import io
import json
import os.path as osp
from abc import ABC, abstractmethod
from dataclasses import dataclass
from configparser import ConfigParser
import torch
import tqdm
from mmengine import Registry
from pydantic.dataclasses import dataclass
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.model import MODELS
from ..source_model.base import BaseInputModel, BaseReader
......@@ -30,18 +35,18 @@ def tprint(*args, **kwargs):
@dataclass
class TurbomindModelConfig:
"""Config for turbomind model."""
model_name: str
tensor_para_size: int
head_num: int
kv_head_num: int
vocab_size: int
num_layer: int
inter_size: int
norm_eps: float
attn_bias: int
start_id: int
end_id: int
session_len: int
model_name: str = None
tensor_para_size: int = None
head_num: int = None
kv_head_num: int = None
vocab_size: int = None
num_layer: int = None
inter_size: int = None
norm_eps: float = None
attn_bias: int = None
start_id: int = None
end_id: int = None
session_len: int = None
weight_type: str = 'fp16'
rotary_embedding: int = 128
rope_theta: float = 10000.0
......@@ -50,9 +55,12 @@ class TurbomindModelConfig:
max_batch_size: int = 64
max_context_token_num: int = 1
step_length: int = 1
cache_max_entry_count: float = 0.5
cache_max_entry_count: float = 0.8
cache_block_seq_len: int = 128
cache_chunk_size: int = 1
cache_chunk_size: int = -1
num_tokens_per_iter: int = 0
max_prefill_iters: int = 1
extra_tokens_per_iter: int = 0
use_context_fmha: int = 1
quant_policy: int = 0
max_position_embeddings: int = 0
......@@ -74,6 +82,34 @@ class TurbomindModelConfig:
default.update(used)
return cls(**default)
@classmethod
def from_engine_config(cls, config: TurbomindEngineConfig):
env = copy.deepcopy(config.__dict__)
env['tensor_para_size'] = env['tp']
ret = TurbomindModelConfig.from_dict(env, allow_none=True)
ret.rotary_embedding = ret.size_per_head
# workaround to support `max_prefill_token_num` in the turbomind engine
if config.max_prefill_token_num is not None and \
config.session_len is not None:
ret.num_tokens_per_iter = config.max_prefill_token_num
ret.max_prefill_iters = (config.session_len +
config.max_prefill_token_num -
1) // config.max_prefill_token_num
return ret
def toini(self):
config = copy.deepcopy(self.__dict__)
parser = ConfigParser()
parser['llama'] = config
with io.StringIO() as ss:
parser.write(ss)
ss.seek(0)
ini = ss.read()
return ini
def __str__(self):
return json.dumps(self.__dict__, indent=2)
@property
def valid(self):
"""Check if cfg is valid."""
......
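A small sketch of how the new from_engine_config/toini pair fits together; the values are illustrative and only fields visible in this diff are used.

# Sketch: derive a turbomind model config from an engine config and render the
# INI text that is handed to the C++ engine.
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig

engine_cfg = TurbomindEngineConfig(tp=2,
                                   session_len=4096,
                                   max_prefill_token_num=1024)
cfg = TurbomindModelConfig.from_engine_config(engine_cfg)
print(cfg.tensor_para_size)                            # 2
print(cfg.num_tokens_per_iter, cfg.max_prefill_iters)  # 1024, 4
print(cfg.toini())                                     # '[llama]' section text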
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import subprocess
def get_llama_gemm():
"""get the executable binary llama_gemm."""
import os.path as osp
import lmdeploy
......@@ -13,12 +15,52 @@ def get_llama_gemm():
return bin_path
def read_config(ini_path: str):
"""read turbomind config from turbomind.
Args:
ini_path (str): the path of `config.ini` file in turbomind model
"""
from configparser import ConfigParser
from lmdeploy.turbomind.deploy.target_model.base import \
TurbomindModelConfig
with open(ini_path, 'r') as f:
parser = ConfigParser()
parser.read_file(f)
section_name = 'llama'
_cfg = parser._sections[section_name]
cfg = TurbomindModelConfig.from_dict(_cfg)
return cfg.head_num, cfg.size_per_head, cfg.inter_size, \
cfg.vocab_size, cfg.tensor_para_size
def main(head_num: int = 32,
size_per_head: int = 128,
vocab_size: int = 32000,
inter_size: int = 11008,
tensor_para_size: int = 1,
max_batch_size: int = 64):
max_batch_size: int = 64,
model_path: str = None):
if model_path is not None:
from lmdeploy.turbomind.turbomind import get_model_source
from lmdeploy.turbomind.utils import ModelSource
model_source = get_model_source(model_path)
if model_source == ModelSource.WORKSPACE:
head_num, size_per_head, inter_size, vocab_size, \
tensor_para_size = read_config(
osp.join(model_path,
'triton_models', 'weights', 'config.ini'))
else:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_path,
trust_remote_code=True)
head_num = config.num_attention_heads
size_per_head = config.hidden_size // head_num
inter_size = config.intermediate_size
vocab_size = config.vocab_size
for bsz in range(1, max_batch_size + 1):
subprocess.call(
f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}'
......
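A hedged invocation sketch for the gemm-tuning entry point above; the module path is an assumption about where this main() lives, and the workspace path is a placeholder.

# Sketch: tune GEMM algorithms for a converted workspace or a HF checkpoint.
from lmdeploy.turbomind.generate_gemm_config import main as tune_gemm  # assumed module path

tune_gemm(max_batch_size=64, model_path='./workspace')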
{
"architectures": [
"LMDeployForCausalLM"
],
"auto_map": {
"AutoConfig": "configuration_lmdeploy.LMDeployConfig",
"AutoModel": "modeling_lmdeploy.LMDeployForCausalLM",
"AutoModelForCausalLM": "modeling_lmdeploy.LMDeployForCausalLM"
},
"turbomind": {}
}
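This config.json wires the lmdeploy classes into transformers' auto classes, so a repo shipping it can be loaded through the standard API. The repo id below is taken from the examples cited later in this diff.

# Sketch: auto_map above resolves LMDeployForCausalLM when trust_remote_code is set.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('lmdeploy/llama2-chat-70b-4bit',
                                             trust_remote_code=True)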
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from transformers import PretrainedConfig
from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig
from lmdeploy.version import __version__ as lm_version
class LMDeployConfig(PretrainedConfig):
"""Lmdeploy config."""
def __init__(self, turbomind: dict = None, **kwargs):
default_tm_cfg = copy.deepcopy(
TurbomindModelConfig.from_dict({}, allow_none=True).__dict__)
if turbomind is not None:
default_tm_cfg.update(turbomind)
self.turbomind = default_tm_cfg
self.lmdeploy_version = lm_version
super().__init__(**kwargs)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
config, kwargs = super().from_pretrained(pretrained_model_name_or_path,
return_unused_kwargs=True,
**kwargs)
for k, v in kwargs.items():
if k in config.turbomind.keys():
config.turbomind[k] = v
if 'tp' in kwargs:
config.turbomind['tensor_para_size'] = kwargs['tp']
if return_unused_kwargs:
return config, kwargs
else:
return config
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import count
from queue import Queue
from typing import List, Optional, Tuple, Union
from huggingface_hub import snapshot_download
from transformers import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from lmdeploy.turbomind import TurboMind
from lmdeploy.turbomind.utils import get_gen_param
from .configuration_lmdeploy import LMDeployConfig
logger = logging.get_logger(__name__)
@dataclass
class Session:
_count = count()
_session_id: int = None
_message: List[Tuple[str, str]] = field(default_factory=list)
_step: int = 0
_nth_round: int = 0
_error: int = 0
def __init__(self):
self._session_id = next(Session._count)
self._message = []
self._step = 0
self._nth_round = 0
@property
def session_id(self):
return self._session_id
@property
def message(self):
return self._message
@property
def step(self):
return self._step
@property
def nth_round(self):
return self._nth_round
@property
def error(self):
return self._error
class LMDeployForCausalLM(PreTrainedModel):
config_class = LMDeployConfig
def __init__(self,
config: LMDeployConfig,
*inputs,
model_path: str = None,
**kwargs):
super().__init__(config)
self.tm_model = TurboMind.from_pretrained(model_path, **kwargs)
que = Queue()
for _ in range(config.turbomind['max_batch_size']):
que.put(self.tm_model.create_instance())
self.que = que
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path,
*model_args,
config: Optional[Union[PretrainedConfig, str,
os.PathLike]] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = 'main',
**kwargs):
"""Instantiate a LM model with turbomind backend."""
resume_download = kwargs.pop('resume_download', True)
proxies = kwargs.pop('proxies', None)
if os.path.isdir(pretrained_model_name_or_path):
local_folder = pretrained_model_name_or_path
else:
local_folder = snapshot_download(
pretrained_model_name_or_path,
revision=revision,
cache_dir=cache_dir,
proxies=proxies,
resume_download=resume_download,
force_download=force_download,
token=token,
local_files_only=local_files_only,
)
if not isinstance(config, PretrainedConfig):
config_path = config if config is not None else local_folder
kwargs.pop('return_unused_kwargs')
config, model_kwargs = cls.config_class.from_pretrained(
config_path, return_unused_kwargs=True, **kwargs)
else:
model_kwargs = kwargs
model = cls(config,
*model_args,
model_path=local_folder,
**model_kwargs)
generation_config = model.tm_model.model.sampling_param
for k, v in dataclasses.asdict(generation_config).items():
if hasattr(model.generation_config, k):
base_value = getattr(model.generation_config, k)
setattr(generation_config, k, base_value)
if k in kwargs:
setattr(generation_config, k, v)
model.generation_config = generation_config
return model
@contextmanager
def managed_generator(self, session: Session):
generator = self.que.get()
try:
yield generator
except: # noqa E722
for _ in generator.stream_infer(session.session_id, [0],
request_output_len=0,
sequence_start=False,
sequence_end=False,
stop=True):
pass
session._error = 1
finally:
self.que.put(generator)
def generate(
self,
input_ids: List[int],
session: Session,
**kwargs,
):
"""Generates sequences of token ids for models with a language modeling
head.
Args:
input_ids (List(int)): list of input token ids
session (Session): session information
kwargs (dict): ad hoc parametrization of generation
"""
with self.managed_generator(session) as generator:
for outputs in generator.stream_infer(
session_id=session.session_id,
input_ids=[input_ids],
**kwargs,
):
res, tokens = outputs[0]
yield res, tokens
def chat(
self,
query: str,
session: Optional[Session] = None,
cap: str = 'chat',
request_output_len: int = 512,
stream_output: bool = False,
ignore_eos=False,
random_seed: Optional[int] = None,
**kwargs,
) -> Tuple[str, Session]:
"""chat."""
if session is None:
session = Session()
assert session._error == 0, 'An error occurred before, ' \
'please start a new session.'
session._message.append([query, ''])
prompt = self.tm_model.model.get_prompt(query, session.nth_round == 0)
input_ids = self.tm_model.tokenizer.encode(prompt)
if len(
input_ids
) + session.step + request_output_len >= self.tm_model.session_len:
logger.error(
f'session_length exceeded {self.tm_model.session_len}')
session._error = 1
yield '', session
else:
gen_param = get_gen_param(cap, self.generation_config,
session.nth_round + 1, session.step,
request_output_len, **kwargs)
gen_kwargs = dataclasses.asdict(gen_param)
gen_kwargs.update(
random_seed=random_seed if session.nth_round == 0 else None,
stream_output=stream_output,
ignore_eos=ignore_eos,
**kwargs)
_step = session._step
_nth_round = session._nth_round
response_size = 0
for res, tokens in self.generate(input_ids,
session=session,
**gen_kwargs):
response = self.tm_model.tokenizer.decode(res.tolist(),
offset=response_size)
if response.endswith('�'):
continue
response_size = tokens
session._message[-1][-1] += response
session._nth_round = _nth_round + 1
session._step = _step + response_size
yield response, session
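A hedged sketch of multi-round chat through the wrapper above, loaded via the auto classes registered earlier; the repo id is again one of the lmdeploy-quantized examples cited in this diff.

# Sketch: stream one chat turn; the Session object carries nth_round and the
# k/v-cache step across rounds.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('lmdeploy/llama2-chat-70b-4bit',
                                             trust_remote_code=True)
session = None
for response, session in model.chat('Hello!', session=session, stream_output=True):
    print(response, end='', flush=True)
print(f'\nround={session.nth_round}, step={session.step}')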
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import copy
import io
import json
import logging
import os.path as osp
import sys
from configparser import ConfigParser
from contextlib import contextmanager
from queue import Queue
from queue import LifoQueue, Queue
from threading import Thread
from typing import Iterable, List, Optional
from typing import Iterable, List, Optional, Union
import numpy as np
import torch
from huggingface_hub import snapshot_download
from torch.nn.utils.rnn import pad_sequence
import lmdeploy
from lmdeploy.model import MODELS, BaseModel
from lmdeploy.messages import (EngineGenerationConfig, ResponseType,
TurbomindEngineConfig)
from lmdeploy.model import (MODELS, BaseModel, ChatTemplateConfig,
best_match_model)
from lmdeploy.tokenizer import Tokenizer
from lmdeploy.utils import get_logger
from lmdeploy.utils import _stop_words, get_logger, get_model
from .deploy.converter import (get_model_format, supported_formats,
update_config_weight_type, update_output_format)
from .deploy.source_model.base import INPUT_MODELS
from .deploy.target_model.base import OUTPUT_MODELS, TurbomindModelConfig
from .utils import (ModelSource, check_tm_model_input, create_hf_download_args,
get_hf_config_content, get_model_source)
from .utils import ModelSource, get_model_from_config, get_model_source
# TODO: find another way import _turbomind
lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
sys.path.append(osp.join(lmdeploy_dir, 'lib'))
import _turbomind as _tm # noqa: E402
logger = logging.getLogger(__name__)
logger = get_logger('lmdeploy')
def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
"""return list of stop-words to numpy.ndarray."""
if stop_words is None:
def _construct_stop_or_bad_words(words: List[int] = None):
if words is None or len(words) == 0:
return None
assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
f'stop_words must be a list but got {type(stop_words)}'
stop_words = [
tokenizer.encode(stop_word, False)[-1] for stop_word in stop_words
]
assert isinstance(stop_words, List) and all(
isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
# each id in stop_words represents a stop word
# refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
# detailed explanation about fastertransformer's stop_words
stop_word_offsets = range(1, len(stop_words) + 1)
stop_words = np.array([[stop_words, stop_word_offsets]]).astype(np.int32)
return stop_words
offsets = range(1, len(words) + 1)
combined = np.array([[words, offsets]]).astype(np.int32)
return combined
def _np_dict_to_tm_dict(np_dict: dict):
......@@ -77,6 +64,59 @@ def _tm_dict_to_torch_dict(tm_dict: _tm.TensorMap):
return ret
def _update_engine_config(config: TurbomindEngineConfig, **kwargs):
if config is None:
config = TurbomindEngineConfig()
for k, v in kwargs.items():
if v and hasattr(config, k):
setattr(config, k, v)
logger.warning(f'kwargs {k} is deprecated to initialize model, '
'use TurbomindEngineConfig instead.')
if config.model_name is not None:
logger.warning('model_name is deprecated in TurbomindEngineConfig '
'and has no effect')
return config
def _update_tm_config(dst: TurbomindModelConfig, src: TurbomindEngineConfig):
# A workaround to support max token number of each iteration in prefill
if src.max_prefill_token_num is not None and src.session_len is not None:
dst.num_tokens_per_iter = src.max_prefill_token_num
dst.max_prefill_iters = (src.session_len + src.max_prefill_token_num -
1) // src.max_prefill_token_num
dst_dict = copy.deepcopy(dst.__dict__)
src_dict = copy.deepcopy(src.__dict__)
src_dict['tensor_para_size'] = src_dict['tp']
for k, v in src_dict.items():
if v is not None and k in dst_dict:
dst_dict[k] = v
return TurbomindModelConfig.from_dict(dst_dict)
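A worked example of the prefill split above, with illustrative numbers.

# A 4096-token context prefilled in chunks of at most 1024 tokens needs 4 iterations.
session_len, max_prefill_token_num = 4096, 1024
num_tokens_per_iter = max_prefill_token_num
max_prefill_iters = (session_len + max_prefill_token_num - 1) // max_prefill_token_num
assert (num_tokens_per_iter, max_prefill_iters) == (1024, 4)  # ceil(4096 / 1024) == 4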
def _compare_individual_gpu_memory(tp: int):
logger.setLevel(level=logging.INFO)
try:
total_mem = []
free_mem = []
for i in range(tp):
torch.cuda.set_device(i)
free, total = torch.cuda.mem_get_info()
total_mem.append(total / (1024**2))
free_mem.append(free / (1024**2))
all_total_equal = all(total == total_mem[0] for total in total_mem)
all_free_equal = all(free == free_mem[0] for free in free_mem)
if not all_total_equal or not all_free_equal:
logger.warning(
f'Memory discrepancy detected: Total Memory={total_mem} MB, \
Free Memory={free_mem} MB')
except Exception as e:
logger.error(f'An exception occurred: {e}')
@contextmanager
def cuda_ctx(device_id):
old_device = torch.cuda.current_device()
......@@ -102,34 +142,75 @@ class TurboMind:
def __init__(self,
model_path: str,
engine_config: TurbomindEngineConfig = None,
model_source: ModelSource = ModelSource.WORKSPACE,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
**kwargs):
# check memory equality when tp
if tp is not None:
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
self.gpu_count = tp if tp is not None else 1
if tp > 1:
_compare_individual_gpu_memory(tp)
elif engine_config is not None and engine_config.tp is not None:
if engine_config.tp > 1:
_compare_individual_gpu_memory(engine_config.tp)
# if loading from workspace and engine_config is None, use config.ini
# and ignore passed args like model_format, tp, etc.
if model_source == ModelSource.WORKSPACE and engine_config is None:
def _catch_args(**kwargs):
args = []
for k, v in kwargs.items():
if v and hasattr(TurbomindEngineConfig, k):
args.append(k)
return args
args = _catch_args(**kwargs, model_format=model_format, tp=tp)
if len(args) > 0:
logger.warning(
f'loading from workspace, ignore args {args} '
'please use TurbomindEngineConfig or modify config.ini')
else:
engine_config = _update_engine_config(engine_config,
model_format=model_format,
group_size=group_size,
tp=tp,
**kwargs)
tp = engine_config.tp if engine_config is not None else 1
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
self.gpu_count = tp
if model_source == ModelSource.WORKSPACE:
tokenizer_model_path = osp.join(model_path, 'triton_models',
'tokenizer')
self.tokenizer = Tokenizer(tokenizer_model_path)
self.model_comm = self._from_workspace(model_path)
self.model_comm = self._from_workspace(model_path=model_path,
engine_config=engine_config)
else:
if not osp.exists(model_path):
model_path = get_model(model_path, engine_config.download_dir,
engine_config.revision)
self.tokenizer = Tokenizer(model_path)
self.model_comm = self._from_hf(model_source=model_source,
model_path=model_path,
model_name=model_name,
model_format=model_format,
group_size=group_size,
tp=tp,
**kwargs)
engine_config=engine_config)
if chat_template_config:
if chat_template_config.model_name is None:
chat_template_config.model_name = self.model_name
logger.warning(f'Input chat template with model_name is None. '
f'Forcing to use {self.model_name}')
self.model = chat_template_config.chat_template
else:
self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
self.session_len = self.config.session_len
self.eos_id = self.tokenizer.eos_token_id
self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
self.session_len = self.model.session_len
self.stop_words = _stop_words(self.model.stop_words, self.tokenizer)
def _create_weight(self, model_comm):
......@@ -194,88 +275,61 @@ class TurboMind:
tm_params[k] = []
tm_params[k].append(v)
def _from_hf(self,
model_source: ModelSource,
model_path: str,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
**kwargs):
def _from_hf(self, model_source: ModelSource, model_path: str,
engine_config: TurbomindEngineConfig):
"""Load model which is in hf format."""
# get model_name, group_size if is lmdeploy managed.
if model_source == ModelSource.HF_LMDEPLOY:
config = get_hf_config_content(model_path, local_files_only=True)
tm_config = config['turbomind']
tm_config.update(kwargs)
var_shoud_be_none = dict(model_name=model_name,
model_format=model_format,
group_size=group_size)
for key, value in var_shoud_be_none.items():
assert value is None, f'{key} should be None when model is '\
f'from {model_source}'
model_name = tm_config['model_name']
group_size = tm_config['group_size']
if tm_config['weight_type'] == 'int4':
model_format = 'awq'
else:
assert model_name is not None, 'please supply model_name when ' \
f'model is form {model_source}'
if osp.exists(osp.join(model_path, 'outputs_stats.pth')):
model_format = 'awq' if model_format is None else model_format
group_size = 128 if group_size is None else group_size
tm_config = kwargs
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'
assert model_format in supported_formats, 'the model format ' \
f'should be in {supported_formats}'
assert model_source == ModelSource.HF_MODEL, \
f'{model_source} is not supported'
assert engine_config.model_format in supported_formats, \
f'The model format should be in {supported_formats}'
# update model_format if not supplied and outputs_stats.pth exists
if osp.exists(osp.join(model_path, 'outputs_stats.pth')) and \
engine_config.model_format is None:
engine_config.model_format = 'awq'
# when convert model, use architectures in config.json
model_arch = get_model_from_config(model_path)
data_type = 'fp16'
output_format = 'fp16'
inferred_model_format = get_model_format(model_name, model_format)
cfg = TurbomindModelConfig.from_dict(tm_config, allow_none=True)
# overwrite with input params
cfg.model_name = model_name
cfg.tensor_para_size = 1 if tp is None else tp
cfg.rotary_embedding = cfg.size_per_head
cfg.group_size = group_size
inferred_model_format = get_model_format(model_arch,
engine_config.model_format)
cfg = TurbomindModelConfig.from_engine_config(engine_config)
match_name = best_match_model(model_path)
# for session len
cfg.model_name = match_name \
if match_name is not None else 'base'
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
data_type = 'int4'
assert group_size > 0, f'group_size: {group_size} should > 0'
cfg.group_size = 128
else:
output_format = update_output_format(model_name,
output_format = update_output_format(cfg.model_name,
inferred_model_format,
model_path, output_format)
data_type = output_format
update_config_weight_type(output_format, cfg)
self.config = cfg
self.model_name = model_name
self.data_type = data_type
input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path, tokenizer_path=model_path, ckpt_path=None)
output_model = OUTPUT_MODELS.get(output_format)(
input_model=input_model, cfg=cfg, to_file=False, out_dir='')
config = copy.deepcopy(output_model.cfg.__dict__)
logger.warning(f'model_config:\n{json.dumps(config, indent=2)}')
parser = ConfigParser()
parser['llama'] = config
with io.StringIO() as ss:
parser.write(ss)
ss.seek(0)
config = ss.read()
cfg = output_model.cfg
if engine_config.session_len is not None:
cfg.session_len = engine_config.session_len
self.model_name = cfg.model_name
self.config = cfg
self.data_type = data_type
logger.warning(f'model_config:\n\n{cfg.toini()}')
model_comm = _tm.AbstractTransformerModel.create_llama_model(
model_dir='',
config=config,
config=cfg.toini(),
tensor_para_size=self.gpu_count,
data_type=data_type)
......@@ -289,35 +343,48 @@ class TurboMind:
output_model.export()
# load kv qparams
self._load_kv_qparams(model_path, tm_params, **kwargs)
self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
assert len(tm_params) == 0, f'missing {tm_params.keys()}'
return model_comm
def _from_workspace(self, model_path: str):
def _from_workspace(self, model_path: str,
engine_config: TurbomindEngineConfig):
"""Load model which is converted by `lmdeploy convert`"""
ini_path = osp.join(model_path, 'triton_models', 'weights',
'config.ini')
# load cfg
with open(ini_path, 'r') as f:
parser = ConfigParser()
parser.read_file(f)
section_name = 'llama'
tp_cfg = parser.getint(section_name, 'tensor_para_size')
if tp_cfg != 1 and tp_cfg != self.gpu_count:
get_logger('turbomind').info(
f'found tp={tp_cfg} in config.ini.')
self.gpu_count = tp_cfg
self.model_name = parser.get(section_name, 'model_name')
self.data_type = parser.get(section_name, 'weight_type')
cfg = parser._sections[section_name]
cfg = TurbomindModelConfig.from_dict(cfg)
self.config = cfg
section_name = 'llama'
_cfg = parser._sections[section_name]
cfg = TurbomindModelConfig.from_dict(_cfg)
# check whether input tp is valid
if cfg.tensor_para_size != 1 and \
self.gpu_count != cfg.tensor_para_size:
logger.info(f'found tp={cfg.tensor_para_size} in config.ini.')
self.gpu_count = cfg.tensor_para_size
# update cfg
if engine_config is not None:
engine_config.tp = cfg.tensor_para_size
cfg = _update_tm_config(cfg, engine_config)
if engine_config.session_len is not None:
cfg.session_len = engine_config.session_len
# update cls
self.config = cfg
self.model_name = cfg.model_name
self.data_type = cfg.weight_type
# create model
logger.warning(f'model_config:\n\n{cfg.toini()}')
weight_dir = osp.join(model_path, 'triton_models', 'weights')
model_comm = _tm.AbstractTransformerModel.create_llama_model(
weight_dir,
model_dir=weight_dir,
config=cfg.toini(),
tensor_para_size=self.gpu_count,
data_type=self.data_type)
......@@ -326,13 +393,16 @@ class TurboMind:
return model_comm
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path: str,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
**kwargs):
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
engine_config: TurbomindEngineConfig = None,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
**kwargs):
"""LMDeploy's turbomind inference engine.
Args:
......@@ -346,7 +416,7 @@ class TurboMind:
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
on huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when pretrained_model_name_or_path is iii)
......@@ -357,26 +427,14 @@ class TurboMind:
Can be used to update configuration when initialize the engine.
"""
model_source = get_model_source(pretrained_model_name_or_path)
if model_source == ModelSource.WORKSPACE:
local_path = pretrained_model_name_or_path
else:
check_tm_model_input(pretrained_model_name_or_path,
model_name=model_name,
**kwargs)
if not osp.exists(pretrained_model_name_or_path):
download_kwargs = create_hf_download_args(**kwargs)
local_path = snapshot_download(pretrained_model_name_or_path,
**download_kwargs)
else:
local_path = pretrained_model_name_or_path
logger.warning(f'model_source: {model_source}')
return cls(model_source=model_source,
model_path=local_path,
model_name=model_name,
return cls(model_path=pretrained_model_name_or_path,
engine_config=engine_config,
model_source=model_source,
model_format=model_format,
group_size=group_size,
tp=tp,
chat_template_config=chat_template_config,
**kwargs)
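An end-to-end sketch of the refactored entry point above; the model id and generation settings are placeholders, and only parameters visible in this diff are used.

# Sketch: build the engine from a HF model with TurbomindEngineConfig, then stream
# one prompt through a generator instance.
from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig
from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained(
    'internlm/internlm-chat-7b',  # placeholder hub id or local path
    engine_config=TurbomindEngineConfig(tp=1, session_len=4096))
generator = tm_model.create_instance()
input_ids = tm_model.tokenizer.encode('Hello')
for status, res, tokens in generator.stream_infer(
        session_id=1,
        input_ids=[input_ids],
        gen_config=EngineGenerationConfig(max_new_tokens=128, top_k=40)):
    pass
print(tm_model.tokenizer.decode(res))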
def create_instance(self, cuda_stream_id=0):
......@@ -406,8 +464,6 @@ class TurboMindInstance:
self.gpu_count = tm_model.gpu_count
self.stop_words = tm_model.stop_words
self.stop_tokens = [] if self.stop_words is None else \
self.stop_words.flatten().tolist()
self.eos_id = tm_model.eos_id
self.session_len = tm_model.session_len
......@@ -456,23 +512,92 @@ class TurboMindInstance:
t.start()
self.threads[device_id] = t
def _async_forward_callback(self, result, ctx, que: LifoQueue):
que.put((False, result))
def _async_forward_thread(self, inputs, que: LifoQueue):
instance_comm = self.tm_model.model_comm.create_instance_comm(
self.gpu_count)
def _func(device_id, enque_output):
with cuda_ctx(device_id):
output = self.model_insts[device_id].forward(
inputs, instance_comm)
if enque_output:
que.put((True, output))
for device_id in range(self.gpu_count):
t = Thread(target=_func,
args=(device_id, device_id == 0),
daemon=True)
t.start()
self.threads[device_id] = t
def _update_generation_config(self, config: EngineGenerationConfig,
**kwargs: dict):
if config is None:
config = EngineGenerationConfig()
# backward compatibility
# if stop words are not supplied, use the default
if config.stop_words is None and self.stop_words is not None:
config.stop_words = self.stop_words[0][0].tolist()
deprecated_kwargs = []
for k, v in kwargs.items():
if k in config.__dict__:
config.__dict__[k] = v
deprecated_kwargs.append(k)
if 'request_output_len' in kwargs:
config.max_new_tokens = kwargs['request_output_len']
deprecated_kwargs.append('request_output_len')
for k in deprecated_kwargs:
logger.warning(f'kwargs {k} is deprecated for inference, '
'use GenerationConfig instead.')
return config
def end(self, session_id: int):
"""End the given session."""
input_ids = [self.tm_model.tokenizer.eos_token_id]
end_generator = self.tm_model.create_instance()
for outputs in end_generator.stream_infer(session_id,
input_ids,
request_output_len=0,
sequence_start=False,
sequence_end=True):
pass
async def async_end(self, session_id: int):
"""End the given session."""
self.end(session_id)
await asyncio.sleep(0.002)
def cancel(self, session_id: int):
"""Stop current streaming inference."""
input_ids = [self.tm_model.tokenizer.eos_token_id]
stop_generator = self.tm_model.create_instance()
for outputs in stop_generator.stream_infer(session_id,
input_ids,
request_output_len=0,
sequence_start=False,
sequence_end=False,
stop=True):
pass
async def async_cancel(self, session_id: int):
"""End the given session."""
self.cancel(session_id)
await asyncio.sleep(0.002)
def prepare_inputs(self,
session_id,
input_ids,
gen_config: EngineGenerationConfig,
input_embeddings=None,
input_embedding_ranges=None,
request_output_len: int = 512,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
stop=False,
top_p=0.8,
top_k=40,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=None,
stream_output=False):
stop=False):
"""Convert inputs format."""
if len(input_ids) == 0:
input_ids = [[]]
......@@ -504,19 +629,16 @@ class TurboMindInstance:
input_ids=input_ids,
input_lengths=input_lengths,
request_output_len=np.full(input_lengths.shape,
request_output_len,
gen_config.max_new_tokens,
dtype=np.uint32),
runtime_top_k=_broadcast_np(top_k, np.uint32),
runtime_top_p=_broadcast_np(top_p, np.float32),
temperature=_broadcast_np(temperature, np.float32),
repetition_penalty=_broadcast_np(repetition_penalty, np.float32),
runtime_top_k=_broadcast_np(gen_config.top_k, np.uint32),
runtime_top_p=_broadcast_np(gen_config.top_p, np.float32),
temperature=_broadcast_np(gen_config.temperature, np.float32),
repetition_penalty=_broadcast_np(gen_config.repetition_penalty,
np.float32),
step=step,
# session input
session_len=self.session_len *
np.ones([
batch_size,
], dtype=np.uint32),
START=_broadcast_np((1 if sequence_start else 0), np.int32),
END=_broadcast_np((1 if sequence_end else 0), np.int32),
CORRID=np.array(session_id, dtype=np.uint64),
......@@ -560,20 +682,29 @@ class TurboMindInstance:
inputs['input_embeddings'] = input_embeddings
inputs['input_embedding_ranges'] = input_embedding_ranges
if ignore_eos:
if gen_config.min_new_tokens is not None:
inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens,
np.int32)
bad_words = []
if gen_config.bad_words is not None:
bad_words.extend(gen_config.bad_words)
if gen_config.ignore_eos:
stop_words = None
bad_words = torch.tensor([[[self.eos_id], [1]]], dtype=torch.int32)
bad_words.append(self.eos_id)
else:
stop_words = self.stop_words
bad_words = None
stop_words = gen_config.stop_words
stop_words = _construct_stop_or_bad_words(stop_words)
bad_words = _construct_stop_or_bad_words(bad_words)
if stop_words is not None:
inputs['stop_words_list'] = stop_words
if bad_words is not None:
inputs['bad_words_list'] = bad_words
if random_seed is not None:
inputs['random_seed'] = _broadcast_np(random_seed, np.uint64)
if gen_config.random_seed is not None:
inputs['random_seed'] = _broadcast_np(gen_config.random_seed,
np.uint64)
return inputs, input_lengths
async def async_stream_infer(self,
......@@ -581,18 +712,13 @@ class TurboMindInstance:
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
request_output_len: int = 512,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
stop=False,
top_p=0.8,
top_k=40,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=None,
stream_output=False):
gen_config: EngineGenerationConfig = None,
stream_output=False,
**kwargs):
"""Perform model inference.
Args:
......@@ -601,60 +727,46 @@ class TurboMindInstance:
input_embeddings (List[numpy.ndarray]): embeddings features
input_embedding_ranges (List[Tuple[int,int]]): the begin/end
offsets of input_embeddings to input_ids
request_output_len (int): the max number of to-be-generated tokens
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
step (int): the offset of the k/v cache
stop (bool): indicator for cancelling the session
top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
random_seed (int): seed used by sampling
gen_config (EngineGenerationConfig): generation config
stream_output (bool): indicator for stream output
kwargs (dict): kwargs for backward compatibility
"""
# start forward thread
que = LifoQueue()
from functools import partial
_forward_callback = partial(self._async_forward_callback, que=que)
_forward_thread = partial(self._async_forward_thread, que=que)
if stream_output and not stop:
self.model_insts[0].register_callback(self._forward_callback)
self.model_insts[0].register_callback(_forward_callback)
gen_config = self._update_generation_config(gen_config, **kwargs)
inputs, input_lengths = self.prepare_inputs(
session_id=session_id,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
request_output_len=request_output_len,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stop=stop,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=ignore_eos,
random_seed=random_seed,
stream_output=stream_output)
gen_config=gen_config)
tm_inputs = _np_dict_to_tm_dict(inputs)
# start forward thread
self.que = Queue()
self._forward_thread(tm_inputs)
_forward_thread(tm_inputs)
seq_start = input_lengths + input_lengths.new_tensor(step)
prev_len = 0
# generator
while True:
# Thanks for https://github.com/frankxyy and his issue
# https://github.com/InternLM/lmdeploy/issues/832
while self.que.qsize() == 0:
await asyncio.sleep(0)
while self.que.qsize() > 1:
self.que.get()
while que.qsize() == 0: # let other requests in
await asyncio.sleep(0.002)
finish, tm_outputs = self.que.get()
finish, tm_outputs = que.get()
outputs = _tm_dict_to_torch_dict(tm_outputs)
......@@ -667,22 +779,27 @@ class TurboMindInstance:
sequence_length -= seq_start.to(sequence_length.device)
outputs = []
status = ResponseType.FINISH if finish else ResponseType.SUCCESS
for output, len_ in zip(output_ids, sequence_length):
output, len_ = output, len_.item()
if len(output) > 0 and output[-1].item(
) == self.eos_id and not ignore_eos:
outputs.append((output[:-1], len_ - 1))
elif len(output) > 0 and output[-1].item() in self.stop_tokens:
outputs.append((output[:-1], len_))
if len(output) > 0 and output[-1].item() == self.eos_id \
and not gen_config.ignore_eos:
outputs = (status, output[:-1].tolist(), len_ - 1)
elif len(output) > 0 and \
gen_config.stop_words is not None and \
output[-1].item() in gen_config.stop_words:
outputs = (status, output[:-1].tolist(), len_)
else:
outputs.append((output, len_))
outputs = (status, output.tolist(), len_)
if outputs[-1] < prev_len and not finish:
continue
else:
prev_len = outputs[-1]
yield outputs
if finish:
for t in self.threads:
t.join()
while self.que.qsize() > 0:
self.que.get()
break
if stream_output and not stop:
......@@ -693,18 +810,13 @@ class TurboMindInstance:
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
request_output_len: int = 512,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
stop=False,
top_p=0.8,
top_k=40,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=None,
stream_output=False):
gen_config: EngineGenerationConfig = None,
stream_output=False,
**kwargs):
"""Perform model inference.
Args:
......@@ -713,42 +825,28 @@ class TurboMindInstance:
input_embeddings (List[numpy.ndarray]): embeddings features
input_embedding_ranges (List[Tuple[int,int]]): the begin/end
offsets of input_embeddings to input_ids
request_output_len (int): the max number of to-be-generated tokens
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
step (int): the offset of the k/v cache
stop (bool): indicator for cancelling the session
top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
random_seed (int): seed used by sampling
gen_config (EngineGenerationConfig): generation config
stream_output (bool): indicator for stream output
kwargs (dict): kwargs for backward compatibility
"""
if stream_output and not stop:
self.model_insts[0].register_callback(self._forward_callback)
gen_config = self._update_generation_config(gen_config, **kwargs)
inputs, input_lengths = self.prepare_inputs(
session_id=session_id,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
request_output_len=request_output_len,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stop=stop,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=ignore_eos,
random_seed=random_seed,
stream_output=stream_output)
gen_config=gen_config)
tm_inputs = _np_dict_to_tm_dict(inputs)
# start forward thread
......@@ -775,15 +873,18 @@ class TurboMindInstance:
sequence_length -= seq_start.to(sequence_length.device)
outputs = []
status = ResponseType.FINISH if finish else ResponseType.SUCCESS
for output, len_ in zip(output_ids, sequence_length):
output, len_ = output, len_.item()
if len(output) > 0 and output[-1].item(
) == self.eos_id and not ignore_eos:
outputs.append((output[:-1], len_ - 1))
elif len(output) > 0 and output[-1].item() in self.stop_tokens:
outputs.append((output[:-1], len_))
if len(output) > 0 and output[-1].item() == self.eos_id \
and not gen_config.ignore_eos:
outputs = (status, output[:-1].tolist(), len_ - 1)
elif len(output) > 0 and \
gen_config.stop_words is not None and \
output[-1].item() in gen_config.stop_words:
outputs = (status, output[:-1].tolist(), len_)
else:
outputs.append((output, len_))
outputs = (status, output.tolist(), len_)
yield outputs
if finish:
......@@ -796,17 +897,27 @@ class TurboMindInstance:
if stream_output and not stop:
self.model_insts[0].unregister_callback()
def decode(self, input_ids):
def decode(self,
input_ids,
steps: List[int] = None,
sequence_start: bool = True,
sequence_end: bool = True):
"""Perform context decode on input tokens.
Args:
input_ids (numpy.ndarray): the batch of input token ids
steps (List[int]): the offset of the k/v cache
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
"""
if len(input_ids) == 0:
input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
if steps is None:
steps = [0] * len(input_ids)
assert isinstance(steps, List) and len(steps) == len(input_ids)
# append an extra token since input_len-1 tokens will be
# decoded by context decoder
......@@ -827,11 +938,16 @@ class TurboMindInstance:
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=self.eos_id)
steps = torch.IntTensor([step for step in steps])
inputs = dict(input_ids=input_ids,
input_lengths=input_lengths,
request_output_len=_broadcast_np(0, dtype=np.uint32),
is_return_logits=_broadcast_np(1, np.uint32))
is_return_logits=_broadcast_np(1, np.uint32),
START=_broadcast_np((1 if sequence_start else 0),
np.int32),
END=_broadcast_np((1 if sequence_end else 0), np.int32),
step=steps)
tm_inputs = _np_dict_to_tm_dict(inputs)
......@@ -844,3 +960,83 @@ class TurboMindInstance:
logits = outputs['logits']
return logits[:, :-1, :]
def get_ppl(self, input_ids: Union[List[int], List[List[int]]]):
"""Get perplexity scores given a list of input tokens.
Args:
input_ids (Union[List[int], List[List[int]]]): the batch of input token ids
""" # noqa 501
if len(input_ids) == 0:
input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
max_input_len = 16 * 1024
# max_input_len = 16
n_max_iter = np.ceil(
max([len(input_id)
for input_id in input_ids]) / max_input_len).astype(int)
device = 'cpu' if n_max_iter > 1 else 'cuda'
index_range_starts = []
index_range_ends = []
for input_id in input_ids:
index_range_start = np.array(
[i * max_input_len for i in range(n_max_iter)])
index_range_end = index_range_start + max_input_len
index_range_start[index_range_start >= len(input_id)] = len(
input_id)
index_range_end[index_range_end >= len(input_id)] = len(input_id)
index_range_starts.append(index_range_start)
index_range_ends.append(index_range_end)
logits = []
for i in range(n_max_iter):
steps = [start[i] for start in index_range_starts]
_input_ids = [
input_id[start[i]:end[i]] for input_id, start, end in zip(
input_ids, index_range_starts, index_range_ends)
]
_logits = self.decode(_input_ids,
steps,
sequence_start=(i == 0),
sequence_end=(i == n_max_iter - 1))
_logits = _logits.to(device=device)
logits.append(_logits)
# concat logits. Shape is [bsz, seq_len, vocab_size]
logits = torch.cat(logits, dim=1)
# get target ids
padding_token_id = -100
target_ids = [(_input_ids + [padding_token_id])[1:]
for _input_ids in input_ids]
target_ids = [
torch.Tensor(torch.LongTensor(_target_ids))
for _target_ids in target_ids
]
target_ids = pad_sequence(target_ids,
batch_first=True,
padding_value=padding_token_id)
target_ids = target_ids.to(logits.device)
target_mask = target_ids != padding_token_id
target_count = torch.sum(target_mask, dim=-1)
# compute cross entropy loss
bsz, seq_len, vocab_size = logits.shape
flat_logits = logits.contiguous().view(-1, vocab_size)
flat_target_ids = target_ids.contiguous().view(-1)
flat_loss_matrix = torch.nn.functional.cross_entropy(
flat_logits,
flat_target_ids,
reduction='none',
ignore_index=padding_token_id)
loss_matrix = flat_loss_matrix.view(bsz, seq_len)
loss_sum = torch.sum(loss_matrix * target_mask, dim=1)
loss_avg = loss_sum / target_count
loss_avg = loss_avg.cpu().numpy()
return loss_avg
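A short usage sketch for get_ppl above; the model id is a placeholder.

# Sketch: perplexity of a piece of text. get_ppl returns the mean token NLL per
# sequence, so exp() of it gives the perplexity.
import numpy as np

from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained('internlm/internlm-chat-7b')  # placeholder
generator = tm_model.create_instance()
token_ids = tm_model.tokenizer.encode('The quick brown fox jumps over the lazy dog.')
loss_avg = generator.get_ppl([token_ids])
print('ppl:', float(np.exp(loss_avg[0])))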
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
import logging
import os
from huggingface_hub import hf_hub_download
from transformers.utils import ExplicitEnum
logger = logging.getLogger(__name__)
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
class ModelSource(ExplicitEnum):
"""Turbomind model source."""
WORKSPACE = 'workspace'
HF_MODEL = 'hf_model'
HF_LMDEPLOY = 'hf_lmdeploy'
def create_hf_download_args(**kwargs) -> dict:
download_kwargs = {
'revision': None,
'cache_dir': None,
'proxies': None,
'resume_download': True,
'force_download': False,
'token': None,
'local_files_only': False
}
for k in download_kwargs.keys():
if k in kwargs:
download_kwargs[k] = kwargs[k]
return download_kwargs
def get_hf_config_path(pretrained_model_name_or_path, **kwargs) -> str:
"""Get local hf config local file path."""
if os.path.exists(pretrained_model_name_or_path):
config_path = os.path.join(pretrained_model_name_or_path,
'config.json')
else:
download_kwargs = create_hf_download_args(**kwargs)
config_path = hf_hub_download(pretrained_model_name_or_path,
'config.json', **download_kwargs)
return config_path
def get_hf_config_content(pretrained_model_name_or_path, **kwargs) -> dict:
"""Get config content of a hf model."""
config_path = get_hf_config_path(pretrained_model_name_or_path, **kwargs)
with open(config_path, 'r') as f:
config = json.load(f)
return config
def get_model_source(pretrained_model_name_or_path: str,
......@@ -60,61 +21,33 @@ def get_model_source(pretrained_model_name_or_path: str,
'triton_models')
if os.path.exists(triton_model_path):
return ModelSource.WORKSPACE
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
model_source = ModelSource.HF_LMDEPLOY if 'turbomind' in config \
else ModelSource.HF_MODEL
return model_source
def check_tm_model_input(pretrained_model_name_or_path, **kwargs):
"""Check if single input pretrained_model_name_or_path is enough to use."""
if kwargs.get('model_name', None):
return
model_source = get_model_source(pretrained_model_name_or_path, **kwargs)
if model_source == ModelSource.WORKSPACE:
return
return ModelSource.HF_MODEL
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
if 'turbomind' in config and config['turbomind']['model_name'] != '':
return
assert (0), '\nCan not get model name from input model, '\
'please supply model name with arg --model-name,' \
'you can list supported models by `lmdeploy list`'
@dataclasses.dataclass
class GenParam:
    top_p: float
    top_k: float
    temperature: float
    repetition_penalty: float
    sequence_start: bool = False
    sequence_end: bool = False
    step: int = 0
    request_output_len: int = 512


def get_gen_param(cap,
                  sampling_param,
                  nth_round,
                  step,
                  request_output_len=512,
                  **kwargs):
    """return parameters used by token generation."""
    gen_param = GenParam(**dataclasses.asdict(sampling_param),
                         request_output_len=request_output_len)
    # Fix me later. turbomind.py doesn't support None top_k
    if gen_param.top_k is None:
        gen_param.top_k = 40
    if cap == 'chat':
        gen_param.sequence_start = (nth_round == 1)
        gen_param.sequence_end = False
        gen_param.step = step
    else:
        gen_param.sequence_start = True
        gen_param.sequence_end = True
        gen_param.step = 0
    return gen_param


def get_model_from_config(model_dir: str):
    import json
    config_file = os.path.join(model_dir, 'config.json')
    default = 'llama'
    if not os.path.exists(config_file):
        return default

    with open(config_file) as f:
        config = json.load(f)

    ARCH_MAP = {
        'LlavaLlamaForCausalLM': default,
        'LlamaForCausalLM': default,
        'InternLM2ForCausalLM': 'internlm2',
        'InternLMForCausalLM': default,
        'BaiChuanForCausalLM': 'baichuan',  # Baichuan-7B
        'BaichuanForCausalLM': 'baichuan2',  # not right for Baichuan-13B-Chat
        'QWenLMHeadModel': 'qwen',
    }

    arch = 'LlamaForCausalLM'
    if 'auto_map' in config:
        arch = config['auto_map']['AutoModelForCausalLM'].split('.')[-1]
    elif 'architectures' in config:
        arch = config['architectures'][0]

    return ARCH_MAP[arch]
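# A minimal sketch of resolving the chat-template name for a checkpoint; the
# directory below is a hypothetical example, and the lookup falls back to
# 'llama' when no config.json is present.
def _example_resolve_model_name(
        model_dir: str = './models/internlm2-chat-7b') -> str:
    name = get_model_from_config(model_dir)
    # e.g. 'internlm2' for an InternLM2ForCausalLM checkpoint
    return name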
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import functools
import json
import logging
import os
import sys
import time
from contextlib import contextmanager
from logging import Logger, LogRecord
from typing import List, Optional
from huggingface_hub import hf_hub_download
logger_initialized = {}
def get_logger(name: str,
log_file: Optional[str] = None,
log_level: int = logging.INFO,
file_mode: str = 'w'):
class _ANSI_COLOR:
BRIGHT_RED = '\033[91m'
RED = '\033[31m'
YELLOW = '\033[33m'
WHITE = '\033[37m'
GREEN = '\033[32m'
class ColorFormatter(logging.Formatter):
    _LEVELNAME_COLOR_MAP = dict(CRITICAL=_ANSI_COLOR.BRIGHT_RED,
                                ERROR=_ANSI_COLOR.RED,
                                WARN=_ANSI_COLOR.YELLOW,
                                WARNING=_ANSI_COLOR.YELLOW,
                                INFO=_ANSI_COLOR.WHITE,
                                DEBUG=_ANSI_COLOR.GREEN)
_RESET_COLOR = '\033[0m'
def format(self, record: LogRecord):
"""format."""
if sys.platform == 'win32':
            # Windows does not support ANSI color
return super().format(record)
levelname = record.levelname
level_color = self._LEVELNAME_COLOR_MAP.get(levelname,
self._RESET_COLOR)
levelname = f'{level_color}{levelname}{self._RESET_COLOR}'
record.levelname = levelname
return super().format(record)
class FilterDuplicateWarning(logging.Filter):
"""Filter the repeated warning message.
Args:
name (str): name of the filter.
"""
def __init__(self, name: str = 'lmdeploy'):
super().__init__(name)
self.seen: set = set()
def filter(self, record: LogRecord) -> bool:
"""Filter the repeated warning message.
Args:
record (LogRecord): The log record.
Returns:
bool: Whether to output the log record.
"""
if record.levelno != logging.WARNING:
return True
if record.msg not in self.seen:
self.seen.add(record.msg)
return True
return False
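# A minimal sketch of the de-duplication behaviour, using a throw-away logger
# name 'lmdeploy.demo' (an assumed example, not a logger the library creates):
# the same warning text is only emitted once per process.
def _example_duplicate_warning_filter():
    demo_logger = logging.getLogger('lmdeploy.demo')
    demo_logger.propagate = False
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.addFilter(FilterDuplicateWarning('lmdeploy.demo'))
    demo_logger.addHandler(handler)
    demo_logger.warning('flash attention is not available')  # printed
    demo_logger.warning('flash attention is not available')  # filtered out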
def get_logger(
name: Optional[str] = None,
log_file: Optional[str] = None,
log_level: int = logging.INFO,
file_mode: str = 'w',
log_formatter: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
) -> Logger:
"""Initialize and get a logger by name.
If the logger has not been initialized, this method will initialize the
......@@ -22,25 +96,10 @@ def get_logger(name: str,
log_level (int): The logger level.
file_mode (str): The file mode used in opening log file.
Defaults to 'w'.
log_formatter (str): The logger output format.
Returns:
logging.Logger: The expected logger.
"""
# use logger in mmengine if exists.
try:
from mmengine.logging import MMLogger
if MMLogger.check_instance_created(name):
logger = MMLogger.get_instance(name)
else:
logger = MMLogger.get_instance(name,
logger_name=name,
log_file=log_file,
log_level=log_level,
file_mode=file_mode)
return logger
except Exception:
pass
logger = logging.getLogger(name)
if name in logger_initialized:
return logger
......@@ -56,7 +115,7 @@ def get_logger(name: str,
if type(handler) is logging.StreamHandler:
handler.setLevel(logging.ERROR)
stream_handler = logging.StreamHandler()
stream_handler = logging.StreamHandler(stream=sys.stdout)
handlers = [stream_handler]
if log_file is not None:
......@@ -66,14 +125,15 @@ def get_logger(name: str,
file_handler = logging.FileHandler(log_file, file_mode)
handlers.append(file_handler)
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = ColorFormatter(log_formatter)
for handler in handlers:
handler.setFormatter(formatter)
handler.setLevel(log_level)
handler.addFilter(FilterDuplicateWarning(name))
logger.addHandler(handler)
logger.setLevel(log_level)
logger.propagate = False
logger_initialized[name] = True
return logger
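# A usage sketch, assuming a hypothetical log file path './lmdeploy_demo.log';
# get_logger attaches a stdout handler (and a file handler when log_file is
# given) the first time a name is requested and returns the cached logger on
# later calls.
def _example_get_logger():
    demo = get_logger('lmdeploy', log_file='./lmdeploy_demo.log',
                      log_level=logging.DEBUG)
    demo.debug('engine warming up')
    demo.warning('session length is close to the configured limit')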
......@@ -95,3 +155,103 @@ def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
if response.endswith(item):
response = response[:len(response) - len(item)]
return response
# TODO remove stop_word_offsets stuff and make it clean
def _stop_words(stop_words: List[str], tokenizer: object):
"""return list of stop-words to numpy.ndarray."""
import numpy as np
if stop_words is None:
return None
assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
f'stop_words must be a list but got {type(stop_words)}'
stop_indexes = []
for stop_word in stop_words:
stop_indexes += tokenizer.indexes_containing_token(stop_word)
assert isinstance(stop_indexes, List) and all(
isinstance(elem, int) for elem in stop_indexes), 'invalid stop_words'
# each id in stop_indexes represents a stop word
# refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
# detailed explanation about fastertransformer's stop_indexes
stop_word_offsets = range(1, len(stop_indexes) + 1)
stop_words = np.array([[stop_indexes, stop_word_offsets]]).astype(np.int32)
return stop_words
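# A shape-only sketch with a stub tokenizer; real lmdeploy tokenizers expose
# indexes_containing_token, and the token ids below are made-up placeholders.
def _example_stop_words():

    class _StubTokenizer:

        def indexes_containing_token(self, token: str):
            # hypothetical ids: a real tokenizer returns every vocab id whose
            # decoded text contains the given token
            return {'<eoa>': [92542], '\n': [364]}[token]

    words = _stop_words(['<eoa>', '\n'], _StubTokenizer())
    # words.shape == (1, 2, 2): words[0, 0] holds the ids, words[0, 1] the
    # cumulative offsets
    return words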
def get_hf_config_content(pretrained_model_name_or_path: str,
**kwargs) -> dict:
"""Get config content of a hf model."""
if os.path.exists(pretrained_model_name_or_path):
config_path = os.path.join(pretrained_model_name_or_path,
'config.json')
else:
config_path = hf_hub_download(pretrained_model_name_or_path,
'config.json')
with open(config_path, 'r') as f:
config = json.load(f)
return config
def get_model(pretrained_model_name_or_path: str,
download_dir: str = None,
revision: str = None):
"""Get model from huggingface or modelscope."""
import os
if os.getenv('LMDEPLOY_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
else:
from huggingface_hub import snapshot_download
download_kwargs = {}
if download_dir is not None:
download_kwargs['cache_dir'] = download_dir
if revision is not None:
download_kwargs['revision'] = revision
model_path = snapshot_download(pretrained_model_name_or_path,
**download_kwargs)
return model_path
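# A download sketch; the repo id and cache directory below are illustrative
# placeholders. Setting LMDEPLOY_USE_MODELSCOPE=True routes the same call
# through modelscope's snapshot_download instead of huggingface_hub's.
def _example_download(repo_id: str = 'internlm/internlm2-chat-7b') -> str:
    local_path = get_model(repo_id, download_dir='./hf_cache', revision='main')
    return local_path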
def logging_timer(op_name: str, logger: Logger, level: int = logging.DEBUG):
"""logging timer."""
@contextmanager
def __timer():
"""timer."""
start = time.perf_counter()
yield
end = time.perf_counter()
duration = (end - start) * 1000
logger.log(level, f'<{op_name}> take time: {duration:.2f} ms')
def __inner(func):
"""inner."""
@functools.wraps(func)
        def __func_wrapper(*args, **kwargs):
            """func wrapper."""
if logger.level > level:
return func(*args, **kwargs)
with __timer():
return func(*args, **kwargs)
@functools.wraps(func)
        def __async_wrapper(*args, **kwargs):
            """async wrapper."""
async def __tmp():
if logger.level > level:
return (await func(*args, **kwargs))
with __timer():
return (await func(*args, **kwargs))
return __tmp()
if asyncio.iscoroutinefunction(func):
            return __async_wrapper
else:
            return __func_wrapper
return __inner
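# A usage sketch of the decorator; 'demo_step' is an arbitrary operation name,
# and it assumes the 'lmdeploy' logger is first initialised at DEBUG so the
# timing line is not skipped.
@logging_timer('demo_step', get_logger('lmdeploy', log_level=logging.DEBUG))
def _example_timed_step():
    """Stand-in for real work; logs '<demo_step> take time: ... ms' at DEBUG."""
    time.sleep(0.01)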
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
__dcu_version__ = '0.1.0'
__version__ = '0.1.0'
__dcu_version__ = '0.2.6'
__version__ = '0.2.6'
short_version = __version__
......
......@@ -3,9 +3,10 @@ m2r==0.2.1
markdown>=3.4.0
mistune==0.8.4
myst-parser
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-e git+https://github.com/InternLM/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
recommonmark
sphinx==4.0.2
sphinx-copybutton
sphinx-tabs
sphinx_markdown_tables>=0.0.16
sphinxcontrib-mermaid