"vscode:/vscode.git/clone" did not exist on "a5cedce215130d0d7a5a1ad27ab2254cbc864881"
Commit fe851fbc authored by zhouxiang

Supplement newly added files for version 0.2.6

parent e2d98ddc
# Copyright (c) OpenMMLab. All rights reserved.
import re
from transformers import PreTrainedTokenizerFast
from lmdeploy.utils import get_logger
from .base import BasicAdapterFast
logger = get_logger(__name__)
B_INST, E_INST = '[INST]', '[/INST]'
B_SYS, E_SYS = '<<SYS>>\n', '\n<</SYS>>\n\n'
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" # noqa: E501
class Llama2Adapter(BasicAdapterFast):
"""Adapter for llama2.
Llama2 use the following template and the first user prompt
should contain a system prompt.
User can specify the system prompt using a <<SYS>> tag otherwise
the default system prompt is prepended to user's input.
<bos>
[INST]<space>
<<SYS>>\n
SYSTEM_PROMPT\n
<</SYS>>\n\n
{user_prompt_1}<space>
[/INST]<space>
{answer_1}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}<space>
[/INST]<space>
{answer_2}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_3}(no space here)
...
"""
start_ids = []
sep_ids = []
def __init__(self, tokenizer: PreTrainedTokenizerFast):
super().__init__(tokenizer)
self.prev_round = 0
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template."""
if self.prev_round == 0:
res = re.search(r'<<SYS>>(.*?)<</SYS>>(.*)', prompt)
if res:
prompt = B_SYS + res.group(1).strip() + \
E_SYS + res.group(2).strip()
else:
prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + prompt
prompt = f'{B_INST} {prompt.strip()} {E_INST}'
logger.debug(f'decorated prompt: {repr(prompt)}')
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=True,
return_tensors='pt',
)
self.prev_round += 1
return input_ids
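# --- Illustrative sketch (not from the original file) -----------------------
# How a first-round prompt is decorated when the user message contains no
# explicit <<SYS>> tag: the default system prompt is prepended, then the whole
# string is wrapped with [INST] ... [/INST] before tokenization.
#
#   prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + 'Hello!'
#   prompt = f'{B_INST} {prompt.strip()} {E_INST}'
#   # -> '[INST] <<SYS>>\nYou are a helpful, ... <</SYS>>\n\nHello! [/INST]'
#
# Later rounds skip the system block and are wrapped as '[INST] ... [/INST]'.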
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat through command line.
This submodule allows users to chat with a language model through the command
line, and optionally accelerate the model using backends like deepspeed.
Example 1: Chat with default setting
```python
python -m lmdeploy.legacy.pytorch.chat $PATH_TO_HF_MODEL
```
Example 2: Disable sampling
```python
python -m lmdeploy.legacy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0
```
Example 3: Accelerate with deepspeed inference
```python
python -m lmdeploy.legacy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
Note: to use deepspeed, you need to install deepspeed,
and if you hope to accelerate InternLM, you need a customized version
https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0
Example 4: Tensor parallel the model on 2 GPUs
```python
deepspeed --module --num_gpus 2 lmdeploy.legacy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
This module also allows the following control commands to change
generation behaviors during chat.
- `exit`: terminate and exit chat
- `config set key=value`: change generation config `key` to `value`,
e.g. `config set temperature=0` disables sampling for the following chats
- `clear`: clear chat history
"""
import itertools
import logging
from typing import Optional
import torch
from transformers import GenerationConfig, PreTrainedModel
from lmdeploy.utils import get_logger
from .adapters import init_adapter
from .dist import get_local_rank, get_rank, get_world_size
from .model import accel_model, init_model
from .session import BasicSessionManagerWithHistory
from .utils import BasicStreamer, TerminalIO, control
def set_logging(log_file: str, debug: bool):
torch.set_printoptions(linewidth=120)
level = logging.DEBUG if debug else logging.INFO
log_file = log_file or 'chat.log'
if (r := get_rank()) != 0:
log_file = log_file + f'.{r}'
format = '%(filename)s: \
%(levelname)s: \
%(funcName)s(): \
%(lineno)d:\t \
%(message)s'
logger = get_logger(__name__,
log_file=log_file,
log_level=level,
file_mode='w',
log_formatter=format)
print(f'Worker {get_rank()} logging to {log_file}')
return logger
def main(
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None,
):
"""Chat with model through terminal.
Args:
model_path (str): Path to model.
tokenizer_path (str): Path to tokenizer.
accel (str): Model accelerator.
max_new_tokens (int): Maximum number of tokens to generate.
temperature (float): Temperature for sampling.
top_p (float): Top p for sampling.
seed (int): Random seed.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
This argument is passed directly to transformers' ``AutoTokenizer.from_pretrained``.
Generally, users should choose fast tokenizers.
But if the fast tokenizer raises errors, try forcing a slow one.
max_alloc (int): Maximum memory to allocate (for deepspeed).
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
This includes both the history and the current session.
log_file (str): Path to log file.
debug (bool): Whether to enable debug mode.
adapter (str): Force the use of a specific adapter.
Generally users should not set this argument because the adapter is selected
based on the model type. It is only required when automatic selection fails,
e.g. llama 1 and llama 2 cannot be distinguished by the `LlamaForCausalLM`
class alone. Currently, only "llama1" is accepted, for llama1 models.
""" # noqa: E501
logger = set_logging(log_file, debug)
# workers should sync in sampling
torch.manual_seed(seed)
local_rank = get_local_rank()
world_size = get_world_size()
# Init model and tokenizer
if not tokenizer_path:
tokenizer_path = model_path
model, tokenizer = init_model(
model_path,
tokenizer_path,
use_fast_tokenizer=use_fast_tokenizer,
)
# Init adapter based on model and tokenizer
adapter = init_adapter(model, tokenizer, adapter)
# Accelerate model
model: PreTrainedModel = accel_model(model,
accel,
max_alloc=max_alloc,
tp_size=world_size)
# warmup
warmup_config = GenerationConfig(
max_new_tokens=1,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
model.generate(torch.tensor([[6]], device=get_local_rank()), warmup_config)
gen_config = GenerationConfig(
max_new_tokens=max_new_tokens,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
# Session manager handling history
max_session_len = max_alloc if max_session_len is None else max_session_len
sm = BasicSessionManagerWithHistory(max_session_len=max_session_len,
start_ids=adapter.start_ids,
sep_ids=adapter.sep_ids)
io = TerminalIO()
streamer = BasicStreamer(adapter.decode, io.output)
for r in itertools.count(1):
# User input from IO
logger.info(f'Round {r}')
prompt: str = io.input()
logger.info(f'User input: {prompt}')
# Allow user to change config during runtime or exit
if control(prompt, gen_config, sm):
continue
# Tokenize and apply model specific templates
input_ids = adapter.encode_and_decorate(prompt)
logger.info(f'Input ids:\n{input_ids}')
# Prepend chat history (tensor concatenation)
input_ids = sm.prepend_history(input_ids)
logger.info(f'Input ids with history:\n{input_ids}')
# Generate
input_ids = input_ids.cuda(local_rank)
# returned tensor including input and generated output
output = model.generate(input_ids,
gen_config,
streamer=streamer,
stopping_criteria=adapter.stopping_criteria)
logger.info(f'Output:\n{output}')
# Save output into session manager and maybe trim some history
sm.add_to_history(output)
def cli():
import fire
fire.Fire(main)
if __name__ == '__main__':
cli()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import queue
import warnings
from typing import List, Optional
import pynvml
import torch
import torch.multiprocessing as mp
from torch.nn.utils.rnn import pad_sequence
from transformers import (AutoTokenizer, PreTrainedModel,
PreTrainedTokenizerBase)
from lmdeploy.utils import get_logger
from .model import accel_model, init_model
def safe_numel(free_mem, model_size, max_intermediate):
"""Number of elements without out-of-memory."""
return int(free_mem - model_size) // max_intermediate
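# Worked example (illustrative numbers): with ~78e9 bytes free on a GPU, a
# 7B fp16 model of ~14e9 bytes and ~2e6 bytes of intermediate memory per
# token, safe_numel(78e9, 14e9, 2e6) -> 32000.0, i.e. roughly 32k tokens
# (batch_size * seq_len) can be decoded in one forward pass.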
def avail_gpus(percentage=0.96):
"""Detect available gpus.
Args:
percentage (float): The minimum percentage of free memory to be
considered as available.
Returns:
A list of available gpu ids and the average free memory (in bytes)
across those gpus.
"""
gpus = []
mems = []
pynvml.nvmlInit()
for i in range(torch.cuda.device_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(int(i))
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
free, total = int(mem_info.free), int(mem_info.total)
if free / total > percentage:
gpus.append(i)
mems.append(free)
pynvml.nvmlShutdown()
if len(gpus) == 0:
raise RuntimeError('No GPU available.')
return gpus, sum(mems) / len(mems)
@torch.no_grad()
def decode_single(model: PreTrainedModel,
input_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
return_logits=True):
"""Decode a single batch.
Args:
model (PreTrainedModel): Pretrained model.
input_ids (torch.Tensor): A batch of input ids.
attention_mask (torch.Tensor): A batch of attention masks.
Returns:
torch.Tensor: A batch of logits or probabilities (on CPU).
Note:
This function assumes input_ids[i] = [bos, x1, x2, ..., xn]
and return prob = [p(x1|bos), p(x2|bos,x1), ..., p(xn|bos..xn-1)]
So prob is shorter than input_ids by 1.
"""
# Call Causal LM forward
outputs = model(input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=False,
output_attentions=False,
use_cache=False,
return_dict=True)
# fp32, [bs, seq_len, vocab_size]
logits = outputs.logits
if not return_logits:
# inplace softmax to get probs
torch.softmax(logits, dim=-1, out=logits)
# Shift to fetch probabilities
shift_labels = input_ids[..., 1:].contiguous()
shift_probs = logits[..., :-1, :].contiguous()
logits = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
if attention_mask is not None:
logits *= attention_mask[..., None]
logits = logits.cpu()
return logits
def worker_fn(model_path: str,
inq: mp.Queue,
outq: mp.Queue,
accel: Optional[str] = None,
gpu_id=0):
# torch.set_default_device(gpu_id)
model, _ = init_model(model_path)
model = model.eval()
model = accel_model(model, accel, gpu_id=gpu_id)
while True:
try:
idx, args = inq.get(timeout=1)
except queue.Empty:
continue
if idx is None:
print(f'Worker {gpu_id} received exit signal.')
break
# print(args)
input_ids, input_lens, *args = args
input_ids = input_ids.cuda(gpu_id)
max_len = max(input_lens)
assert max_len == input_ids.size(-1), \
f'input_ids.shape = {input_ids.shape}, max_len = {max_len}'
input_lens = torch.tensor(input_lens, device=gpu_id)
attention_mask = \
torch.arange(max_len, device=gpu_id)[None, :] < input_lens[:, None]
assert attention_mask.shape == input_ids.shape, \
f'attention_mask.shape = {attention_mask.shape}'
try:
probs = decode_single(model, input_ids, attention_mask, *args)
except torch.cuda.OutOfMemoryError:
warnings.warn(
f'OOM on GPU {gpu_id}, discarding prompts at indices {idx}.')
probs = torch.empty((input_ids.size(0), 0),
dtype=torch.float32,
device='cpu')
outq.put((idx, probs))
print(f'Exiting worker {gpu_id} ...')
inq.close()
outq.close()
print(f'Worker {gpu_id} finished.')
class Engine:
"""Multi-GPU deciding engine.
Args:
model_path (str): Path to the pretrained model.
tokenizer_path (str, optional): Path to the pretrained tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
tokenizer (PreTrainedTokenizerBase, optional): Pre-configured tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
accel (str, optional): Acceleration method.
Defaults to None. 'deepspeed' is not tested.
gpu_mem_percentage (float, optional): GPUs whose free-memory fraction is
larger than this value are considered available and used as decode devices.
Defaults to 0.96.
model_size_byte (float, optional): (Approximate) model size in bytes.
Defaults to 14e9 (7B model in FP16).
bytes_per_token (float, optional): (Approximate) memory cost per token in bytes.
Defaults to 2e6 (2MB).
``bytes_per_token`` and ``model_size_byte`` are used to compute
the maximum batch size for a given sequence length.
""" # noqa: E501
def __init__(self,
model_path: str,
tokenizer_path: Optional[str] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
accel: Optional[str] = None,
gpu_mem_percentage: float = 0.96,
model_size_byte=14e9,
bytes_per_token=2e6):
gpu_ids, mem = avail_gpus(gpu_mem_percentage)
print(f'Available GPUs are: {gpu_ids}, ', end='')
print(f'with {mem/2**30:.2f} GiB free.')
ctx = mp.get_context('spawn')
inq = ctx.Queue()
outq = ctx.Queue()
ps = []
for id in gpu_ids:
p = ctx.Process(target=worker_fn,
args=(model_path, inq, outq, accel, id))
p.start()
ps.append(p)
if tokenizer is None:
if tokenizer_path is None:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
self.gpu_ids = gpu_ids
self.inq = inq
self.outq = outq
self.ps = ps
self.tokenizer = tokenizer
self.safe_numel = safe_numel(mem, model_size_byte, bytes_per_token)
def clear_queue(self):
for q in self.inq, self.outq:
while not q.empty():
q.get()
def decode(self,
token_ids: List[List[int]],
sort=True,
max_bs: int = 1024,
pad=True,
pad_token_id=2,
return_logits=True):
"""Inference the model to compute probabilities.
Args:
token_ids (List[List[int]]): List of list of token ids.
sort (bool, optional): Internally sort the prompts by length to achieve better efficiency.
Defaults to True.
Note: the order of returned probabilities always matches the input order.
max_bs (int, optional): Maximum batch size.
Defaults to 1024.
pad (bool, optional): Pad the prompts in every mini batch to the same length.
Defaults to True. Set to False to save memory.
return_logits (bool, optional): Return logits instead of probabilities.
Returns:
numpy.ndarray: Array of logits of shape [bsz, seqlen, vocab_size],
with prob=0 padded, if pad is True
List[numpy.ndarray]: List of logits without padding, if pad is False.
Note:
This function will accept input token_ids = [x0(=bos), x1, x2, ..., xn]
and compute prob = [p(x1|x0), p(x2|x0,x1), ..., p(xn|x0..xn-1)]
So prob is shorter than input_ids by 1.
""" # noqa: E501
self.clear_queue()
# sort to achieve better efficiency
if sort:
pids_and_indicis = sorted(enumerate(token_ids),
key=lambda i_and_x: len(i_and_x[1]))
else:
pids_and_indicis = list(enumerate(token_ids))
left = 0
bs = max_bs
while left < len(token_ids):
if not sort:
bs = max_bs
right = min(left + bs, len(token_ids))
# batch of prompts
sub_p_and_i = pids_and_indicis[left:right]
idx, sub_p = zip(*sub_p_and_i)
# batch of input_ids and attn_masks
# inputs = self.tokenizer(sub_p, return_tensors='pt', padding=True)
input_ids = [torch.tensor(p) for p in sub_p]
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=pad_token_id)
input_lens = [len(p) for p in sub_p]
# Dynamic batch size based on safe memory
while input_ids.numel() > self.safe_numel:
if bs == 1:
break
bs = max(1, round(bs / 1.5))
print(f'\nReduce bs to {bs} when seq len reaches '
f'{input_ids.shape[-1]}')
idx = idx[:bs]
input_lens = input_lens[:bs]
input_ids = input_ids[:bs, :max(input_lens)]
# Send to worker
self.inq.put((idx, (input_ids, input_lens)))
left += bs
print(
f'Distributing prompts {right}/{len(token_ids)},'
f' {right/len(token_ids):.0%}',
end='\r')
print()
# Collect outputs from workers
all_probs = [None] * len(token_ids)
count = 0
while count < len(token_ids):
idx, probs = self.outq.get()
for i, p in zip(idx, probs):
assert all_probs[i] is None
all_probs[i] = p
count += len(idx)
print(
f'Decoding and collecting outputs '
f'{count}/{len(token_ids)}, '
f'{count/len(token_ids):.0%}',
end='\r')
print()
if pad:
all_probs = pad_sequence(all_probs, batch_first=True)
all_probs = all_probs.cpu().numpy()
else:
all_probs = [p.cpu().numpy() for p in all_probs]
return all_probs
def __del__(self):
print('Exiting engine ...')
for _ in self.ps:
self.inq.put((None, None))
for p in self.ps:
p.join(timeout=1)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_path',
default='llama2/huggingface/llama-2-7b',
help='Path to HuggingFace model and tokenizer.')
parser.add_argument(
'--test_path',
default='',
help='Path to text file, with each line containing a prompt.')
parser.add_argument(
'-p',
'--prompts',
nargs='*',
default=[
'I believe the meaning of life is to find your gift.',
'Simply put, the theory of relativity states that',
'Building a website can be done in 10 simple steps:'
],
help="Prompt in command line, please quote \"\" every sentences, "
'surpassed by --test_path')
parser.add_argument('--min_len',
default=1,
help='Minimum length of prompts')
parser.add_argument('--save-to',
default='decode.out',
help='Save results to this file.')
args = parser.parse_args()
model_path = args.model_path
test_path = args.test_path
prompts = args.prompts
logger = get_logger(__name__)
# logging.basicConfig(level=logging.DEBUG)
# Use test file preferentially
if test_path:
with open(test_path, 'r') as f:
prompts = f.readlines()
prompts = [p.strip() for p in prompts]
# Output infos
print(f'Model path: {model_path}')
def _format(ts, start, end):
if start < 0:
start += len(ts)
if end <= 0:
end += len(ts)
return '\n'.join(
(f'{i}\t{t}' for i, t in zip(range(start, end), ts[start:end])))
if len(prompts) > 10:
print('Prompts:\n' + _format(prompts, 0, 5) + '\n......\n' +
_format(prompts, -5, 0))
else:
print('Prompts:\n' + _format(prompts, 0, 0))
# Init Engine in backend
engine = Engine(model_path)
# Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
input_ids = tokenizer(prompts, padding=False)
input_ids: List[List[int]] = input_ids.input_ids
# Filter out too short prompts
input_ids = [i for i in input_ids if len(i) >= args.min_len]
if len(input_ids) < len(prompts):
logger.warning(
f'Filtered out {len(prompts) - len(input_ids)} prompts, '
f'because they are shorter than {args.min_len}.')
# Decode
logits = engine.decode(input_ids)
print(f'logits.shape = {logits.shape}')
# Save to pth
print(f'Dumping results to = {args.save_to}')
torch.save(logits, args.save_to, pickle_protocol=4)
del engine
# Copyright (c) OpenMMLab. All rights reserved.
"""Helpers for parallel and distributed inference."""
import functools
import os
import torch
from torch.distributed import broadcast, broadcast_object_list, is_initialized
def get_local_rank():
"""Get local rank of current process.
Assume environment variable ``LOCAL_RANK`` is properly set by some launcher.
See: https://pytorch.org/docs/stable/elastic/run.html#environment-variables
""" # noqa: E501
return int(os.getenv('LOCAL_RANK', '0'))
def get_rank():
"""Get rank of current process.
Assume environment variable ``RANK`` is properly set by some launcher.
See: https://pytorch.org/docs/stable/elastic/run.html#environment-variables
""" # noqa: E501
return int(os.getenv('RANK', '0'))
def get_world_size():
"""Get rank of current process.
Assume environment variable ``WORLD_SIZE`` is properly set by some launcher.
See: https://pytorch.org/docs/stable/elastic/run.html#environment-variables
""" # noqa: E501
return int(os.getenv('WORLD_SIZE', '1'))
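# Illustrative note (assuming a torchrun-style launcher): e.g.
#   torchrun --nproc_per_node=2 your_script.py
# sets LOCAL_RANK/RANK to 0 or 1 and WORLD_SIZE to 2 for each worker, which
# is what the three helpers above read. Without a launcher they fall back to
# rank 0 and world size 1.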
def master_only(func):
"""Decorator to run a function only on the master process."""
@functools.wraps(func)
def wrapper(*args, **kwargs):
if is_initialized():
if get_rank() != 0:
return None
return func(*args, **kwargs)
return wrapper
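# Usage sketch (illustrative, hypothetical function): once the process group
# is initialized, only rank 0 runs the body; other ranks get None back.
#
#   @master_only
#   def save_checkpoint(state, path):
#       torch.save(state, path)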
def master_only_and_broadcast_general(func):
"""Decorator to run a function only on the master process and broadcast the
result to all processes."""
@functools.wraps(func)
def wrapper(*args, **kwargs):
if is_initialized():
if get_rank() == 0:
result = [func(*args, **kwargs)]
else:
result = [None]
broadcast_object_list(result, src=0)
result = result[0]
else:
result = func(*args, **kwargs)
return result
return wrapper
def master_only_and_broadcast_tensor(func):
"""Decorator to run a function only on the master process and broadcast the
result to all processes.
Note: Requires a CUDA tensor.
Note: Not really usable because the shape is not known beforehand;
for cpu tensors, use master_only_and_broadcast_general
"""
@functools.wraps(func)
def wrapper(*args, size, dtype, **kwargs):
if is_initialized():
if get_rank() == 0:
result = func(*args, **kwargs)
else:
result = torch.empty(size=size,
dtype=dtype,
device=get_local_rank())
broadcast(result, src=0)
# print(f'rank {get_rank()} received {result}')
else:
result = func(*args, **kwargs)
return result
return wrapper
# Copyright (c) OpenMMLab. All rights reserved.
import time
import warnings
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from lmdeploy.utils import get_logger
from .dist import get_local_rank
logger = get_logger(__name__)
class LoadWoInit:
"""Context manager that disable parameter initialization."""
def __init__(self):
self.constant_ = torch.nn.init.constant_
self.zeros_ = torch.nn.init.zeros_
self.ones_ = torch.nn.init.ones_
self.uniform_ = torch.nn.init.uniform_
self.normal_ = torch.nn.init.normal_
self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
self.kaiming_normal_ = torch.nn.init.kaiming_normal_
def __enter__(self, *args, **kwargs):
torch.nn.init.constant_ = lambda *args, **kwargs: None
torch.nn.init.zeros_ = lambda *args, **kwargs: None
torch.nn.init.ones_ = lambda *args, **kwargs: None
torch.nn.init.uniform_ = lambda *args, **kwargs: None
torch.nn.init.normal_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None
def __exit__(self, *args, **kwargs):
torch.nn.init.constant_ = self.constant_
torch.nn.init.zeros_ = self.zeros_
torch.nn.init.ones_ = self.ones_
torch.nn.init.uniform_ = self.uniform_
torch.nn.init.normal_ = self.normal_
torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
torch.nn.init.kaiming_normal_ = self.kaiming_normal_
def init_model(model_path: str,
tokenizer_path: Optional[str] = None,
use_fast_tokenizer=True):
"""Initialize model and tokenizer from given model path.
Args:
model_path (str): Path to model.
tokenizer_path (str): Path to tokenizer.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
Note:
If the model is converted from a newer version of transformers,
use_fast_tokenizer should be True.
If using decapoda-research/llama-xb-hf, use_fast_tokenizer should be False.
"""
start = time.monotonic()
if not tokenizer_path:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
use_fast=use_fast_tokenizer,
trust_remote_code=True)
with LoadWoInit():
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
trust_remote_code=True)
logger.info(f'Model loaded in {time.monotonic() - start:.1f} seconds')
logger.info(f'Model loaded from {model_path}')
logger.debug(model)
return model, tokenizer
def accel_model(model,
accel: Optional[str] = None,
gpu_id=None,
max_alloc=2048,
tp_size=1):
"""Accelerate model with given accelerator.
Note:
Currently we support only deepspeed or just no acceleration.
"""
logger.info(f'Accelerate model with {accel}')
if accel is None:
# No acceleration, just to cuda
# assume single gpu single process
# user is responsible to assign the gpu id via CUDA_VISIBLE_DEVICES # noqa: E501
gpu_id = gpu_id if gpu_id is not None else get_local_rank()
model = model.cuda(gpu_id)
elif accel.lower() == 'deepspeed':
# Use deepspeed-inference to inject fast kernels and/or tensor parallelism
try:
import deepspeed
except ImportError as e:
raise ImportError('--accel=deepspeed is specified but '
'deepspeed is not installed.\n'
'Install with `pip install deepspeed`.') from e
config = dict(
tensor_parallel=dict(tp_size=tp_size), # Use world size in general
dtype=torch.float16,
replace_with_kernel_inject=True,
max_out_tokens=max_alloc,
)
if 'InternLM' in model.__class__.__name__:
try:
# Use customized deepspeed supporting InternLM
# https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0 (commit cdef2ce) # noqa: E501
from deepspeed.module_inject.containers.internlm import \
InternLMLayerPolicy # noqa: E501
except ImportError:
# InternLM is not officially supported by DeepSpeed
# Set replace_with_kernel_inject=False to use AutoTP
config.update({'replace_with_kernel_inject': False})
warnings.warn(
'\033[0;93m'
'Current installation of deepspeed does not '
'support InternLM. Disable kernel injection. '
'To support InternLM, install customized deepspeed with '
'`pip install git+https://github.com/wangruohui/DeepSpeed@support_internlm_0.10.0`' # noqa: E501
'\033[0m')
else:
for module in model.modules():
# Since remote code is dynamically located,
# we need to do this dynamically
if module.__class__.__name__ == 'InternLMDecoderLayer':
InternLMLayerPolicy._orig_layer_class = module.__class__ # noqa: E501
break
logger.debug(f'Using deepspeed config\n{config}')
model = deepspeed.init_inference(
model=model, # Transformers models
config=config,
)
# for k, v in model.named_parameters():
# logger.debug(f"{k}: v.device")
else:
raise ValueError(f'Unsupported accelerator {accel}.')
logger.debug(model)
return model
# Copyright (c) OpenMMLab. All rights reserved.
from .linear import WeightOnlyQLinear
__all__ = ['WeightOnlyQLinear']
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Type, TypeVar
import torch
from torch import nn
try:
import awq_inference_engine
except ModuleNotFoundError:
awq_inference_engine = None
class WeightOnlyQLinear(nn.Module):
"""This class implements weight only quantization linear.
Args:
w_bit (int): number of bits for quantization.
symmetry (bool): If true, use symmetric quantization,
otherwise use asymmetric quantization.
group_size (int): size of the quantization group.
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): Whether to allocate a bias buffer. Defaults to True.
"""
def __init__(
self,
in_features: int,
out_features: int,
bias: bool = True,
w_bit: int = 4,
symmetry: bool = False,
group_size: int = 128,
) -> None:
super().__init__()
if w_bit not in [2, 4, 8]:
raise NotImplementedError('Only 2,4,8 bit are supported for now.')
self.in_features = in_features
self.out_features = out_features
self.w_bit = w_bit
self.group_size = group_size if group_size != -1 else in_features
assert self.in_features % self.group_size == 0
assert out_features % (32 // self.w_bit) == 0
w_pack_oc = out_features // (32 // self.w_bit)
w_inc = in_features
weight = torch.zeros((w_inc, w_pack_oc), dtype=torch.int32)
self.register_buffer('qweight', weight)
if bias:
self.register_buffer('bias', torch.zeros(out_features))
else:
self.bias = None
s_inc = in_features // self.group_size
s_oc = out_features
scales = torch.zeros((s_inc, s_oc), dtype=torch.float16)
self.register_buffer('scales', scales)
if not symmetry:
z_inc = in_features // self.group_size
z_oc = out_features // (32 // self.w_bit)
zeros = torch.zeros((z_inc, z_oc), dtype=torch.int32)
self.register_buffer('qzeros', zeros)
else:
self.qzeros = None
@classmethod
def from_linear(cls: Type['WeightOnlyQLinear'],
linear: nn.Linear,
quantizer: TypeVar('Quantizer'),
awq_layout: bool = True) -> 'WeightOnlyQLinear':
"""Create a WeightOnlyQLinear object from a PyTorch Linear object.
Args:
linear (nn.Linear): PyTorch Linear object.
quantizer (Quantizer): Object that handles quantization.
awq_layout (bool): AWQ layout. Defaults to True.
Returns:
WeightOnlyQLinear: A WeightOnlyQLinear object.
"""
device = linear.weight.device
w_bit = quantizer.bits
pack_num = 32 // w_bit
if awq_layout:
assert w_bit == 4
pack_order = [0, 2, 4, 6, 1, 3, 5, 7]
else:
pack_order = torch.arange(pack_num)
group_size = quantizer.group_size
symmetry = quantizer.symmetry
in_features = linear.in_features
out_features = linear.out_features
bias = linear.bias is not None
qlinear = cls(in_features, out_features, bias, w_bit, symmetry,
group_size)
qlinear.bias = linear.bias
qparams = quantizer.calculate_qparams(linear.weight)
i32_w = quantizer.quant(linear.weight, qparams, real=True)
i32_w = i32_w.t().contiguous()
pack_int_w = torch.zeros_like(qlinear.qweight).to(device)
for col in range(pack_int_w.shape[1]):
for i in range(pack_num):
pack_int_w_col = i32_w[:, col * pack_num + pack_order[i]]
pack_int_w[:, col] |= pack_int_w_col << (i * w_bit)
qlinear.qweight = pack_int_w
qlinear.scales = qparams.scales.squeeze(-1).t().contiguous()
if qparams.zero_points is not None:
zeros = qparams.zero_points.to(torch.int32).to(device)
zeros = zeros.squeeze(-1).t().contiguous()
pack_int_zeros = torch.zeros_like(qlinear.qzeros).to(device)
for col in range(pack_int_zeros.shape[1]):
for i in range(pack_num):
qzero_col = zeros[:, col * pack_num + pack_order[i]]
pack_int_zeros[:, col] |= qzero_col << (i * w_bit)
qlinear.qzeros = pack_int_zeros
qlinear.to('cpu')
return qlinear
@torch.no_grad()
def forward(self, x):
if awq_inference_engine is None:
raise RuntimeError(
'Run the following command to install '
'the kernel for 4bit inference\n\n'
'git clone https://github.com/mit-han-lab/llm-awq.git\n'
'cd awq/kernels\n'
'python setup.py install\n')
out_shape = x.shape[:-1] + (self.out_features, )
inputs = x.reshape(-1, x.shape[-1])
out = awq_inference_engine.gemm_forward_cuda(inputs.half(),
self.qweight,
self.scales.half(),
self.qzeros,
self.group_size)
out = out + self.bias if self.bias is not None else out
return out.reshape(out_shape)
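# Shape sketch (illustrative numbers): for in_features=4096, out_features=4096,
# w_bit=4 and group_size=128, each int32 packs 32 // 4 = 8 quantized weights:
#   qweight: (4096, 4096 // 8)   == (4096, 512)
#   scales : (4096 // 128, 4096) == (32, 4096)
#   qzeros : (32, 512)           (asymmetric quantization only)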
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from transformers.generation.utils import ModelOutput
from lmdeploy.utils import get_logger
logger = get_logger(__name__)
class BasicSessionManager:
"""Basic session manager without history."""
def prepend_history(self, input_ids):
return input_ids
def add_to_history(self, output):
pass
class BasicSessionManagerWithHistory:
"""Basic session manager with chat history.
Args:
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
reduce_size (int): Number of tokens to be trimmed when reaching maximum
session length. Default: 256.
start_ids (list[int]): Sequences of ids at the start of the chat session.
sep_ids (list[int]): Sequences of ids separating chat sessions.
""" # noqa: E501
bs = 1
def __init__(self,
max_session_len=2048,
reduce_size=256,
start_ids=[1],
sep_ids=[13]) -> None:
self.start_ids = torch.tensor(start_ids, dtype=torch.long)
self.sep_ids = torch.tensor(sep_ids, dtype=torch.long)
assert self.start_ids.ndim == 1
assert self.sep_ids.ndim == 1
self.max_session_len = max(len(start_ids), max_session_len)
self.reduce_size = min(reduce_size, max_session_len - len(start_ids))
assert self.max_session_len > self.reduce_size
self.new_session()
def new_session(self):
self.history_ids = self.start_ids.repeat(self.bs, 1)
def prepend_history(self, input_ids: torch.Tensor):
"""Prepend history ids to input ids and trim if over-length."""
input_ids = input_ids.to(self.history_ids.device).long()
sep_ids = self.sep_ids.to(self.history_ids.device).long().repeat(1, 1)
input_ids = torch.cat([self.history_ids, sep_ids, input_ids], dim=1)
if input_ids.shape[1] > self.max_session_len:
input_ids = input_ids[:,
(self.reduce_size - self.max_session_len):]
input_ids[:, :len(self.start_ids)] = self.start_ids.repeat(
self.bs, 1)
return input_ids
def add_to_history(self, output):
"""Save history output ids.
Note:
Output returned by HuggingFace generator contains both input
and output ids.
"""
if isinstance(output, ModelOutput):
self.history_ids = output.sequences
elif isinstance(output, torch.Tensor):
self.history_ids = output
else:
raise ValueError(f'Unknown output type {type(output)}')
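# Trimming sketch (illustrative numbers): with the defaults
# max_session_len=2048 and reduce_size=256, once history + separator + prompt
# exceeds 2048 tokens, prepend_history keeps only the last 2048 - 256 = 1792
# tokens and rewrites the leading positions with start_ids so the session
# still begins with the start-of-sequence ids.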
# Copyright (c) OpenMMLab. All rights reserved.
from transformers.generation.streamers import BaseStreamer
from lmdeploy.utils import get_logger
from .dist import get_rank, master_only, master_only_and_broadcast_general
try:
import readline # To support command line history # noqa: F401
except ImportError: # readline not available
pass
logger = get_logger(__name__)
class TerminalIO:
"""Terminal input and output."""
end_of_output = '\n'
@master_only_and_broadcast_general
def input(self):
"""Read input from terminal."""
print('\npress Enter twice to end input >>> ', end='')
sentinel = '' # ends when this string is seen
try:
return '\n'.join(iter(input, sentinel))
except EOFError:
print('Detect EOF, exit')
exit()
@master_only
def output(self, string):
"""Output to terminal with flush."""
print(string, end='', flush=True)
class BasicStreamer(BaseStreamer):
"""Basic streamer for HuggingFace models."""
def __init__(self,
decode_func,
output_func,
end_of_output='\n',
skip_prompt=True):
self.decode = decode_func
self.output = output_func
self.end_of_output = end_of_output
self.skip_prompt = skip_prompt
self.gen_len = 0
def put(self, value):
"""Callback before forwarding current token id to model."""
if self.gen_len == 0 and self.skip_prompt:
pass
else:
token = self.decode(value)
self.output(token)
self.gen_len += 1
def end(self):
"""Callback at the end of generation."""
self.output(self.end_of_output)
self.gen_len = 0
def control(prompt, gen_config, sm):
"""Allow user to control generation config and session manager.
Return:
True if control command applied, False otherwise.
"""
if prompt == 'exit':
exit(0)
if prompt == 'clear':
sm.new_session()
logger.info('Session cleared')
return True
# Re-config during runtime
if prompt.startswith('config set'):
try:
keqv = prompt.split()[-1]
k, v = keqv.split('=')
v = eval(v)
gen_config.__setattr__(k, v)
logger.info(f'Worker {get_rank()} set {k} to {repr(v)}')
logger.info(f'Generator config changed to: {gen_config}')
return True
except: # noqa
logger.info(
'invalid config command, treated as a normal conversation turn.')
return False
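# Examples (illustrative) of control commands typed at the chat prompt:
#   exit                           -> terminate the program
#   clear                          -> start a new session, dropping history
#   config set max_new_tokens=256  -> update one field of the GenerationConfig
# Anything else is treated as a normal conversation turn.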
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import shutil
import fire
import torch
from torch import nn
import lmdeploy
from lmdeploy.lite.apis.calibrate import calibrate
from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
smooth_layers)
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.pytorch.models import QLinear, QRMSNorm
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer',
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'InternLM2ForCausalLM': 'InternLM2RMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm',
'LlamaForCausalLM': 'LlamaRMSNorm',
}
LMDEPLOY_ROOT = lmdeploy.__path__[0]
MODEL_PATH_MAP = {
'InternLMForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm.py'),
'InternLM2ForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm2.py'),
'LlamaForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_llama.py'),
'BaiChuanForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_baichuan.py')
}
AUTO_MAP = {
'InternLMForCausalLM': {
'AutoConfig': 'configuration_internlm.InternLMConfig',
'AutoModel': 'modeling_internlm.InternLMForCausalLM',
'AutoModelForCausalLM': 'modeling_internlm.InternLMForCausalLM'
},
'InternLM2ForCausalLM': {
'AutoConfig': 'configuration_internlm2.InternLMConfig',
'AutoModelForCausalLM': 'modeling_internlm2.InternLM2ForCausalLM',
'AutoModel': 'modeling_internlm2.InternLM2ForCausalLM'
},
'LlamaForCausalLM': {
'AutoModel': 'modeling_llama.LlamaForCausalLM',
'AutoModelForCausalLM': 'modeling_llama.LlamaForCausalLM'
},
'BaiChuanForCausalLM': {
'AutoConfig': 'configuration_baichuan.BaiChuanConfig',
'AutoModelForCausalLM': 'modeling_baichuan.BaiChuanForCausalLM'
}
}
def smooth_quant(model: str,
work_dir: str = './work_dir',
calib_dataset: str = 'ptb',
calib_samples: int = 128,
calib_seqlen: int = 2048,
device: str = 'cuda'):
model, tokenizer, work_dir = calibrate(model, calib_dataset, calib_samples,
calib_seqlen, work_dir, device)
# calibrate function exports the calibration statistics
# (inputs, outputs, keys and values) to `work_dir`.
inp_stats = torch.load(work_dir / 'inputs_stats.pth')
act_scales = inp_stats['absmax']
model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
raise RuntimeError(
f'Currently, quantization and calibration of {model_type} are '
f'not supported. The supported model types are '
f"{', '.join(LAYER_TYPE_MAP.keys())}.")
if model_type == 'QWenLMHeadModel':
try:
import flash_attn # noqa: F401
except ImportError:
raise RuntimeError(
'When using Qwen, you need to `pip install flash-attn` first, '
'otherwise calibration and quantization will not work '
'properly.')
layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]
fc2fcs = FC_FCS_MAP[layer_type]
norm2fcs = NORM_FCS_MAP[layer_type]
layers = collect_target_modules(model, layer_type)
fcs = {}
for l_name, layer in layers.items():
name2fc = collect_target_modules(layer, nn.Linear, prefix=l_name)
fcs.update(name2fc)
smooth_layers(layers, fc2fcs, norm2fcs, act_scales, -1, device)
rmsnorms = collect_target_modules(model, norm_type)
for name, linear in fcs.items():
linear.to(device)
q_linear = QLinear.from_float(linear)
parent_name, _, child_name = name.rpartition('.')
parent = model.get_submodule(parent_name)
setattr(parent, child_name, q_linear)
linear.to('cpu')
for name, norm in rmsnorms.items():
norm.to(device)
q_norm = QRMSNorm.from_float(norm)
parent_name, _, child_name = name.rpartition('.')
parent = model.get_submodule(parent_name)
setattr(parent, child_name, q_norm)
norm.to('cpu')
if hasattr(model.config, 'auto_map'):
model.config.auto_map.update(AUTO_MAP[type(model).__name__])
else:
model.config.auto_map = AUTO_MAP[type(model).__name__]
model.save_pretrained(work_dir,
max_shard_size='2GB',
safe_serialization=False)
tokenizer.save_pretrained(work_dir)
shutil.copy(MODEL_PATH_MAP[type(model).__name__], work_dir)
if __name__ == '__main__':
fire.Fire(smooth_quant)
# Copyright (c) OpenMMLab. All rights reserved.
import enum
from dataclasses import dataclass
from typing import Dict, List, Literal, Optional
from pydantic.dataclasses import dataclass as pydantic_dataclass
from .tokenizer import Tokenizer
@dataclass
class GenerationConfig:
"""generation parameters used by inference engines.
Args:
n (int): Define how many chat completion choices to generate for each
input message
max_new_tokens (int): The maximum number of tokens that can be
generated in the chat completion
top_p (float): An alternative to sampling with temperature, called
nucleus sampling, where the model considers the results of the
tokens with top_p probability mass
top_k (int): An alternative to sampling with temperature, where
the model considers the top_k tokens with the highest probability
temperature (float): Sampling temperature
repetition_penalty (float): Penalty to prevent the model from
generating repeated words or phrases. A value larger than
1 discourages repetition
ignore_eos (bool): Indicator to ignore the eos_token_id or not
random_seed (int): Seed used when sampling a token
stop_words (List[str]): Words that stop generating further tokens
bad_words (List[str]): Words that the engine will never generate
min_new_tokens (int): The minimum numbers of tokens to generate,
ignoring the number of tokens in the prompt.
skip_special_tokens (bool): Whether or not to remove special tokens
in the decoding. Default to be True.
"""
n: int = 1
max_new_tokens: int = 512
top_p: float = 1.0
top_k: int = 1
temperature: float = 0.8
repetition_penalty: float = 1.0
ignore_eos: bool = False
random_seed: int = None
stop_words: List[str] = None
bad_words: List[str] = None
min_new_tokens: int = None
skip_special_tokens: bool = True
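# Example (illustrative): a mostly-greedy configuration with a mild repetition
# penalty; unspecified fields keep the defaults above.
#
#   gen_config = GenerationConfig(max_new_tokens=256,
#                                 top_k=1,
#                                 repetition_penalty=1.02)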
@dataclass
class EngineGenerationConfig(GenerationConfig):
"""generation parameter used by the inference engines."""
stop_words: List[int] = None
bad_words: List[int] = None
@staticmethod
def From(gen_config: GenerationConfig, tokenizer: Tokenizer):
"""convert `GenerationConfig` to `EngineGenerationConfig`
Args:
gen_config (GenerationConfig): an instance of class `GenerationConfig`
tokenizer (Tokenizer): a tokenizer to encode the `stop_words` and `bad_words` in `gen_config`
Returns:
EngineGenerationConfig: the generation config used by inference engines
Examples:
>>> from lmdeploy import Tokenizer, GenerationConfig, EngineGenerationConfig
>>> tokenizer = Tokenizer('internlm/internlm-chat-7b')
>>> gen_config = GenerationConfig(stop_words=['<eoa>'])
>>> gen_config = EngineGenerationConfig.From(gen_config, tokenizer)
""" # noqa E501
def special_word_token_ids(words):
if words is not None:
assert isinstance(words, List) and \
all(isinstance(elem, str) for elem in words), \
f'words must be a list of str but got {type(words)}'
indexes = []
for word in words:
indexes += tokenizer.indexes_containing_token(word)
return indexes
return None
return EngineGenerationConfig(
n=gen_config.n,
max_new_tokens=gen_config.max_new_tokens,
min_new_tokens=gen_config.min_new_tokens,
top_p=gen_config.top_p,
top_k=gen_config.top_k,
temperature=gen_config.temperature,
repetition_penalty=gen_config.repetition_penalty,
ignore_eos=gen_config.ignore_eos,
random_seed=gen_config.random_seed,
skip_special_tokens=gen_config.skip_special_tokens,
stop_words=special_word_token_ids(gen_config.stop_words),
bad_words=special_word_token_ids(gen_config.bad_words))
@pydantic_dataclass
class TurbomindEngineConfig:
"""TurboMind Engine config.
Args:
model_name (str): the name of the deployed model, deprecated and has no effect when version > 0.2.1
model_format (str): the layout of the deployed model. It can be one of the following values [hf, llama, awq], `hf` meaning `hf_llama`, `llama` meaning `meta_llama`, `awq` meaning the quantized model by AWQ.
tp (int): the number of GPU cards used in tensor parallelism, default to 1
session_len (int): the max session length of a sequence, default to None
max_batch_size (int): the max batch size during inference, default to 128
cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache.
For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it defaults to 0.5, depicting the percentage of TOTAL GPU memory to be allocated to the k/v cache.
For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8, signifying the percentage of FREE GPU memory to be reserved for the k/v cache
quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
rope_scaling_factor (int): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention
use_logn_attn (bool): whether or not to use logn attention, default to False
download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface.
revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
max_prefill_token_num(int): the number of tokens each iteration during prefill, default to 8192
""" # noqa: E501
model_name: Optional[str] = None
model_format: Optional[str] = None
tp: int = 1
session_len: Optional[int] = None
max_batch_size: int = 128
cache_max_entry_count: float = 0.8
quant_policy: int = 0
rope_scaling_factor: float = 0.0
use_logn_attn: bool = False
download_dir: Optional[str] = None
revision: Optional[str] = None
max_prefill_token_num: int = 8192
@dataclass
class PytorchEngineConfig:
"""PyTorch Engine Config.
Args:
model_name (str): name of the given model.
tp (int): Tensor Parallelism. default 1.
session_len (int): Max session length. Default None.
max_batch_size (int): Max batch size. Default 128.
cache_max_entry_count (float): the percentage of gpu memory occupied
by the k/v cache. For lmdeploy versions greater than `v0.2.1`,
it defaults to 0.8, signifying the percentage of FREE GPU memory
to be reserved for the k/v cache
eviction_type (str): What action to perform when kv cache
is full, ['recompute', 'copy'], Default 'recompute'.
prefill_interval (int): Interval to perform prefill,
Default 16.
block_size (int): paging cache block size, default 64.
num_cpu_blocks (int): Num cpu blocks. If num is 0, cache
would be allocated according to the current environment.
num_gpu_blocks (int): Num gpu blocks. If num is 0, cache
would be allocated according to the current environment.
adapters (dict): The path configs to lora adapters.
max_prefill_token_num (int): tokens per iteration.
thread_safe (bool): thread safe engine instance.
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
revision (str): The specific model version to use.
It can be a branch name, a tag name, or a commit id.
If unspecified, will use the default version.
"""
model_name: str = ''
tp: int = 1
session_len: int = None
max_batch_size: int = 128
cache_max_entry_count: float = 0.8
eviction_type: str = 'recompute'
prefill_interval: int = 16
block_size: int = 64
num_cpu_blocks: int = 0
num_gpu_blocks: int = 0
adapters: Dict[str, str] = None
max_prefill_token_num: int = 4096
thread_safe: bool = False
download_dir: str = None
revision: str = None
class ResponseType(enum.Enum):
"""Response type."""
SUCCESS = enum.auto()
FINISH = enum.auto()
ENGINE_STOP_ERROR = enum.auto()
SESSION_REPEAT = enum.auto()
SESSION_NOT_EXIST = enum.auto()
HANDLER_NOT_EXIST = enum.auto()
@dataclass
class Response:
"""Pack all response information together.
Args:
text (str): the response text from the server. If the output text is
an empty str and the finish_reason is length, it means the session
length is reached.
generate_token_len (int): the response token length.
input_token_len (int): the input prompt token length. Note that it may
contain the chat template part.
session_id (int): the id for running the session. Basically, it refers
to the position index of the input request batch.
finish_reason ('stop' | 'length' | None): the reason the model stopped
generating tokens. This will be 'stop' if the model hit a natural
stop point or a provided stop sequence, 'length' if the maximum
number of tokens specified in the request was reached.
"""
text: str
generate_token_len: int
input_token_len: int
session_id: int
finish_reason: Optional[Literal['stop', 'length']] = None
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
import re
from dataclasses import dataclass
from typing import Any, Dict, List
import torch
from torch import Tensor
from ..block import LogicalTokenBlocks
def _cache_weight(cache: Tensor, weight: Tensor, block_table: Tensor):
"""cache weight."""
assert cache.dim() == 2
assert weight.dim() == 2
assert block_table.dim() == 1
rank, feat_size = weight.size()
assert cache.size(-1) >= feat_size, ('cache.size(-1) >= feat_size failed.')
assert rank <= block_table.size(0), ('rank <= block_table.size(0) failed.')
block_table = block_table[:rank]
cache[block_table, :feat_size] = weight.to(device=cache.device,
dtype=cache.dtype)
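# Shape sketch (illustrative numbers): caching a rank-8 LoRA-A weight of shape
# (8, hidden) into a cache of shape (num_blocks, >= hidden) writes its 8 rows
# into the cache rows selected by block_table[:8]; cache_lora_b below
# transposes the LoRA-B weight first so it is stored in the same
# (rank, features) layout.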
def _get_named_loralinears(model: torch.nn.Module):
"""get all named loralinear."""
from peft.tuners.lora import Linear as LoRALinear
named_loralinear: Dict[str, torch.nn.Module] = dict()
for name, module in model.named_modules():
if isinstance(module, LoRALinear):
named_loralinear[name] = module
return named_loralinear
def _get_layer_index(key: str, config: Any):
"""get layer index of the lora linear."""
layers_pattern = getattr(config, 'layers_pattern', None)
if isinstance(layers_pattern, str):
layers_pattern = [layers_pattern]
if layers_pattern is None or len(layers_pattern) == 0:
layer_index = re.match(r'.*\.[^.]*\.(\d+)\.', key)
return int(layer_index[1])
else:
for pattern in layers_pattern:
layer_index = re.match(f'.*.{pattern}\\.(\\d+)\\.*', key)
if layer_index is not None:
return int(layer_index[1])
def get_indexed_lora_linears(model: torch.nn.Module):
"""get indexed lora linear."""
named_linears = _get_named_loralinears(model)
config = None
peft_config = getattr(model, 'peft_config', dict())
if len(peft_config) > 0:
config = next(iter(peft_config.values()))
indexed_linears = dict()
for name, layer in named_linears.items():
index = _get_layer_index(name, config)
target = name.split('.')[-1]
indexed_linears.setdefault(index, dict())
indexed_linears[index][target] = layer
return indexed_linears
def update_lora_linears(lora_linears: Dict,
weight_maps: List['AdapterWeightMap'],
device: str = 'cuda'):
"""update lora linears."""
def __get_targets():
"""get targets."""
all_targets = set()
for weight_map in weight_maps:
targets = weight_map.target_modules.keys()
all_targets.update(targets)
return all_targets
def __get_linear_meta(target_names):
"""get rank and start."""
rank_map = dict()
start_map = dict()
scaling_map = dict()
for target in target_names:
ranks = [0] + [
weight_map.target_modules[target].rank
for weight_map in weight_maps
]
block_starts = [0] + [
weight_map.target_modules[target].block_start
for weight_map in weight_maps
]
scaling = [0] + [
weight_map.target_modules[target].scaling
for weight_map in weight_maps
]
rank_map[target] = torch.tensor(ranks)
start_map[target] = torch.tensor(block_starts)
scaling_map[target] = torch.tensor(scaling)
return rank_map, start_map, scaling_map
def __update_linear(linear, idx, rank_map, start_map, scaling_map,
adapter_names):
"""update linear."""
linear.layer_idx = idx
linear.ranks = rank_map[target].to(device)
linear.block_starts = start_map[target].to(device)
linear.scaling = scaling_map[target].to(device)
for name in adapter_names:
if name in linear.lora_A:
linear.lora_A.pop(name)
linear.lora_B.pop(name)
adapter_names = [weight_map.adapter_name for weight_map in weight_maps]
all_targets = __get_targets()
for weight_map in weight_maps:
weight_map.expand_targets(all_targets)
rank_map, start_map, scaling_map = __get_linear_meta(all_targets)
for idx, lora_linear in lora_linears.items():
for target, linear in lora_linear.items():
__update_linear(linear,
idx,
rank_map=rank_map,
start_map=start_map,
scaling_map=scaling_map,
adapter_names=adapter_names)
def get_max_lora_weight_size(model: torch.nn.Module):
"""Get max weight size."""
from peft.tuners.lora import Linear as LoRALinear
ret = 0
for _, mod in model.named_modules():
if isinstance(mod, LoRALinear):
weight = mod.base_layer.weight
ret = max(ret, max(weight.shape))
return ret
@dataclass
class TargetMeta:
rank: int
block_start: int
scaling: float
@dataclass
class AdapterWeightMap:
adapter_name: str
block_table: Tensor
target_modules: Dict[str, TargetMeta]
@classmethod
def new(cls, adapter_name: str, rank: int, target_names: List[str],
block_table: Tensor, scaling: float):
"""create new weightmap."""
block_start = 0
target_modules: Dict[str, TargetMeta] = dict()
for name in target_names:
target_modules[name] = TargetMeta(rank, block_start, scaling)
block_start += rank
return AdapterWeightMap(adapter_name,
block_table=block_table,
target_modules=target_modules)
def expand_targets(self,
target_names: List[str],
ignore_exists: bool = True):
for name in target_names:
if name in self.target_modules:
if ignore_exists:
continue
else:
raise RuntimeError(f'target {name} exists.')
self.target_modules[name] = TargetMeta(0, 0, 0.0)
@classmethod
def cache_lora_a(cls, cache: Tensor, weight: Tensor, block_table: Tensor):
"""cache lora a weight."""
return _cache_weight(cache, weight, block_table)
@classmethod
def cache_lora_b(cls, cache: Tensor, weight: Tensor, block_table: Tensor):
"""cache lora b weight."""
return _cache_weight(cache, weight.t(), block_table)
def cache_lora_linear(self, lora_linear: torch.nn.Module, cache_a: Tensor,
cache_b: Tensor):
"""cache lora linear."""
name = self.adapter_name
target_modules = self.target_modules
block_table = self.block_table
block_start = 0
for target, target_meta in target_modules.items():
linear = lora_linear[target]
if not (name in linear.lora_A and name in linear.lora_B):
continue
linear_a = linear.lora_A[name]
linear_b = linear.lora_B[name]
weight_a = linear_a.weight
weight_b = linear_b.weight
assert weight_a is not None
assert weight_b is not None
rank = target_meta.rank
block_offset = block_table[block_start:block_start + rank]
block_start += rank
self.cache_lora_a(cache_a, weight_a, block_offset)
self.cache_lora_b(cache_b, weight_b, block_offset)
def cache_adapter(self, lora_linears: Dict, caches: List[List[Tensor]]):
"""cache all linear."""
assert len(lora_linears) == len(caches), (
'len(lora_linears) == len(caches)')
for idx, lora_linear in lora_linears.items():
assert idx < len(caches), 'idx < len(caches)'
cache_a, cache_b = caches[idx]
self.cache_lora_linear(lora_linear, cache_a, cache_b)
@dataclass
class SchedulerAdapter:
"""lora adapter."""
idx: int
adapter_path: str
adapter_name: str
config: Any
target_modules: List[str]
logical_blocks: LogicalTokenBlocks
adapter_manager: 'AdapterManager'
_active: bool = False
@classmethod
def from_pretrained(cls, adapter_path: str, adapter_name: str, idx: int,
manager: 'AdapterManager'):
"""from_pretrained."""
from peft import PeftConfig
config = PeftConfig.from_pretrained(adapter_path)
return cls.from_config(config,
adapter_name=adapter_name,
idx=idx,
manager=manager)
@classmethod
def from_config(cls, config: Any, adapter_name: str, idx: int,
manager: 'AdapterManager'):
"""from config."""
new_adapter = SchedulerAdapter(
idx,
adapter_path=config.base_model_name_or_path,
adapter_name=adapter_name,
config=config,
target_modules=list(config.target_modules),
logical_blocks=LogicalTokenBlocks(),
adapter_manager=manager)
new_adapter._active = False
return new_adapter
@property
def name(self):
"""get adapter name."""
return self.adapter_name
@property
def rank(self):
"""get rank."""
return self.config.r
@property
def scaling(self):
"""get scaling."""
return self.config.lora_alpha / self.rank
def is_actived(self):
"""check if adapter is active."""
return self._active
def active(self, flag: bool = True):
"""active adapter."""
self.adapter_manager._on_active(self, flag)
self._active = flag
def build_weight_map(self, block_table: Tensor):
return AdapterWeightMap.new(self.name,
rank=self.rank,
target_names=self.target_modules,
block_table=block_table,
scaling=self.scaling)
class AdapterManager:
"""Adapter manager."""
def __init__(self) -> None:
self._adapters: Dict[str, SchedulerAdapter] = dict()
self._adapter_count = 0
self._active_count = 0
self._add_non_adapter()
def _add_non_adapter(self):
"""add non adapter."""
from peft import LoraConfig
adapter_name = None
config = LoraConfig(r=0, target_modules=[])
adapter = self.add_adapter_from_config(config,
adapter_name=adapter_name)
adapter.active()
def _on_active(self, adapter: SchedulerAdapter, flag: bool):
"""on active."""
if adapter._active != flag:
if flag:
self._active_count += 1
else:
self._active_count -= 1
def _add_adapter(self, adapter: SchedulerAdapter):
"""add adapter."""
assert adapter.adapter_name not in self._adapters
self._adapters[adapter.adapter_name] = adapter
self._adapter_count += 1
return adapter
def add_adapter_from_config(self, config: Any, adapter_name: str):
"""add adapter from config."""
adapter = SchedulerAdapter.from_config(config,
adapter_name=adapter_name,
idx=self._adapter_count,
manager=self)
return self._add_adapter(adapter)
def add_adapter_from_pretrained(self, adapter_path: str,
adapter_name: str):
"""add adapter by path and name."""
adapter = SchedulerAdapter.from_pretrained(adapter_path,
adapter_name=adapter_name,
idx=self._adapter_count,
manager=self)
return self._add_adapter(adapter)
def get_adapter(self, name: str, default=None):
"""get adapter."""
return self._adapters.get(name, default)
def num_adapters(self):
"""get num adapters."""
return len(self._adapters)
ADAPTER_MANAGER = AdapterManager()
# Copyright (c) OpenMMLab. All rights reserved.
# modify from: https://github.com/vllm-project/vllm
from dataclasses import dataclass
import numpy as np
def _div_up(x, n):
"""perform div up."""
return (x + n - 1) // n
def _round_up(x, n):
"""perform round up."""
return _div_up(x, n) * n
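# Worked example (illustrative): _div_up(130, 128) == 2 and
# _round_up(130, 128) == 256, so reserving 130 slots on a 128-slot buffer
# pads it by one ALLOC_SIZE chunk to 256 entries (see reserve() below).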
class LogicalTokenBlocks:
"""Logical blocks."""
ALLOC_SIZE = 128
def __init__(self, blocks: np.ndarray = None):
if blocks is None:
self._blocks = np.zeros((self.ALLOC_SIZE, ), dtype=np.int64)
self._num_real = 0
else:
assert blocks.ndim == 1
self._blocks = blocks
self._num_real = len(blocks)
def reserve(self, size: int):
"""reserve cache size."""
num_blocks = self._blocks.size
if num_blocks >= size:
return
reserve_size = _round_up(size - num_blocks, self.ALLOC_SIZE)
self._blocks = np.pad(self._blocks, (0, reserve_size))
def __setitem__(self, *args, **kwargs):
"""set values."""
return self.get_real_blocks().__setitem__(*args, **kwargs)
def __getitem__(self, *args, **kwargs):
"""get values."""
return self.get_real_blocks().__getitem__(*args, **kwargs)
def get_real_blocks(self):
"""get logical blocks."""
return self._blocks[:self._num_real]
def append(self, blocks: np.ndarray):
"""append blocks."""
num_blocks = len(blocks)
self.reserve(num_blocks + self._num_real)
slice_start = self._num_real
slice_end = slice_start + num_blocks
self._num_real += num_blocks
self.__setitem__(slice(slice_start, slice_end), blocks)
def __len__(self):
"""get length."""
return self._num_real
def resize(self, num_blocks: int):
"""resize logical blocks."""
assert num_blocks <= len(self)
self._num_real = num_blocks
def reset(self):
"""reset."""
self.resize(0)
def clone(self):
"""clone logical blocks."""
ret = LogicalTokenBlocks()
ret.append(self[:])
return ret
@dataclass
class PhysicalTokenBlock:
"""Physical block used to schedule key value cache."""
device: str
block_id: int
block_size: int
ref_count: int = 0
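# A short self-contained sketch of how the logical/physical block types above
# are used; the block ids and block size are arbitrary illustrative values.
if __name__ == '__main__':
    blocks = LogicalTokenBlocks()
    blocks.append(np.array([0, 1, 2], dtype=np.int64))
    blocks.append(np.array([5], dtype=np.int64))
    assert len(blocks) == 4
    assert list(blocks[:]) == [0, 1, 2, 5]
    copied = blocks.clone()
    blocks.resize(2)  # keep only the first two blocks
    assert list(blocks[:]) == [0, 1]
    assert len(copied) == 4  # the clone keeps its own copy of the block ids
    phys = PhysicalTokenBlock(device='gpu', block_id=0, block_size=64)
    phys.ref_count += 1  # physical blocks are shared via reference counting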
# Copyright (c) OpenMMLab. All rights reserved.
from logging import Logger
from typing import List
from lmdeploy.utils import get_logger
def _handle_exception(e: Exception,
mod_name: str,
logger: Logger,
message: str = None):
red_color = '\033[31m'
reset_color = '\033[0m'
if message is None:
message = 'Please ensure it has been installed correctly.'
logger.debug('Exception', exc_info=1)
logger.error(f'{type(e).__name__}: {e}')
logger.error(f'{red_color}'
f'<{mod_name}> test failed!\n'
f'{message}'
f'{reset_color}')
exit(1)
def check_env_torch():
"""check PyTorch environment."""
logger = get_logger('lmdeploy')
try:
logger.debug('Checking <PyTorch> environment.')
import torch
a = torch.tensor([1, 2], device='cuda')
b = a.new_tensor([3, 4], device='cuda')
c = a + b
torch.testing.assert_close(c, a.new_tensor([4, 6]))
except Exception as e:
_handle_exception(e, 'PyTorch', logger)
def check_env_triton():
"""check OpenAI Triton environment."""
logger = get_logger('lmdeploy')
try:
logger.debug('Checking <Triton> environment.')
import torch
from .triton_custom_add import custom_add
a = torch.tensor([1, 2], device='cuda')
b = a.new_tensor([3, 4], device='cuda')
c = custom_add(a, b)
torch.testing.assert_close(c, a + b)
except Exception as e:
_handle_exception(e, 'Triton', logger)
def check_env():
"""check all environment."""
logger = get_logger('lmdeploy')
logger.info('Checking environment for PyTorch Engine.')
check_env_torch()
check_env_triton()
def check_transformers_version(model_path: str,
trust_remote_code: bool = True):
"""check transformers version."""
from packaging import version
logger = get_logger('lmdeploy')
def __check_transformers_version():
"""check transformers version."""
logger.debug('Checking <transformers> version.')
trans_version = None
try:
import transformers
trans_version = version.parse(transformers.__version__)
except Exception as e:
_handle_exception(e, 'transformers', logger)
return transformers, trans_version
def __check_config(trans_version):
"""check config."""
logger.debug('Checking <Model> AutoConfig.from_pretrained.')
try:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(
model_path, trust_remote_code=trust_remote_code)
except Exception as e:
message = (
f'Load model config with transformers=={trans_version}'
' failed. '
'Please make sure model can be loaded with transformers API.')
_handle_exception(e, 'transformers', logger, message=message)
return config
def __check_model_transformers_version(config, trans_version):
"""check model transformers version."""
logger.debug('Checking <Model> required transformers version.')
try:
model_trans_version = getattr(config, 'transformers_version')
model_trans_version = version.parse(model_trans_version)
assert trans_version >= model_trans_version, 'Version mismatch.'
except Exception as e:
message = (f'model `{model_path}` requires '
f'transformers version {model_trans_version} '
f'but transformers {trans_version} is installed.')
_handle_exception(e, 'transformers', logger, message=message)
def __check_model_dtype_support(config):
"""Checking model dtype support."""
logger.debug('Checking <Model> dtype support.')
import torch
from lmdeploy.pytorch.config import ModelConfig
try:
model_config = ModelConfig.from_hf_config(config,
model_path=model_path)
if model_config.dtype == torch.bfloat16:
assert torch.cuda.is_bf16_supported(), (
'bf16 is not supported on your device')
except AssertionError as e:
message = (f'Your device does not support `{model_config.dtype}`. '
'Try edit `torch_dtype` in `config.json`.\n'
'Note that this might have negative effect!')
_handle_exception(e, 'Model', logger, message=message)
except Exception as e:
            message = (f'Checking failed with error {e}. '
                       'Please send issue to LMDeploy with error logs.')
_handle_exception(e, 'Model', logger, message=message)
return model_config
_, trans_version = __check_transformers_version()
config = __check_config(trans_version)
__check_model_transformers_version(config, trans_version)
__check_model_dtype_support(config)
def check_model(model_path: str, trust_remote_code: bool = True):
"""check model requirements."""
logger = get_logger('lmdeploy')
logger.info('Checking model.')
check_transformers_version(model_path, trust_remote_code)
def check_adapter(path: str):
"""check adapter."""
logger = get_logger('lmdeploy')
logger.debug(f'Checking <Adapter>: {path}.')
try:
from peft import PeftConfig
PeftConfig.from_pretrained(path)
except Exception as e:
message = ('Please make sure the adapter can be loaded with '
'`peft.PeftConfig.from_pretrained`\n')
err_msg = '' if len(e.args) == 0 else e.args[0]
if 'got an unexpected keyword argument' in err_msg:
            message += ('Or try removing the unexpected keywords '
                        'from `adapter_config.json`.')
_handle_exception(e, 'Model', logger, message=message)
def check_adapters(adapter_paths: List[str]):
"""check adapters."""
if len(adapter_paths) <= 0:
return
logger = get_logger('lmdeploy')
logger.info('Checking adapters.')
for path in adapter_paths:
check_adapter(path)
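# Usage sketch: these checks are meant to run before starting the engine.
# The model and adapter paths below are placeholders, not real checkpoints.
if __name__ == '__main__':
    check_env()  # verifies CUDA-enabled PyTorch and the Triton kernel
    check_model('/path/to/hf_model')  # placeholder path
    check_adapters(['/path/to/lora_adapter'])  # placeholder path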
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import triton
import triton.language as tl
@triton.jit
def _add_kernel(A, B, C, size, BLOCK: tl.constexpr):
"""add kernel."""
prog_id = tl.program_id(0)
offs = prog_id * BLOCK + tl.arange(0, BLOCK)
a = tl.load(A + offs, mask=offs < size)
b = tl.load(B + offs, mask=offs < size)
tl.store(C + offs, a + b, mask=offs < size)
def custom_add(a, b):
"""custom add one."""
c = torch.empty_like(a)
size = c.size(0)
BLOCK = 16
grid = [triton.cdiv(size, BLOCK)]
_add_kernel[grid](a, b, c, size, BLOCK=BLOCK)
return c
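# Quick sanity sketch for the kernel above; it requires a CUDA device and
# mirrors the check performed in `check_env_triton`.
if __name__ == '__main__':
    x = torch.arange(32, device='cuda')
    y = torch.ones(32, dtype=torch.int64, device='cuda')
    torch.testing.assert_close(custom_add(x, y), x + y)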
# Copyright (c) OpenMMLab. All rights reserved.
from dataclasses import dataclass, field
from typing import Any, Dict
import torch
def _get_torch_dtype(config: Any, default: str = 'float16'):
"""Get the torch dtype from the model config.
Args:
config: Config of the hf model.
        default (str): default data type.
"""
torch_dtype = getattr(config, 'torch_dtype', default)
    # torch_dtype in config could be None or already a torch.dtype instance
    torch_dtype = torch_dtype or default
    if isinstance(torch_dtype, str):
        torch_dtype = getattr(torch, torch_dtype)
    return torch_dtype
@dataclass
class SchedulerConfig:
"""Config of scheduler."""
max_batches: int
max_session_len: int
max_request_output_len: int = 512
eviction_type: str = 'recompute'
prefill_interval: int = 16
max_active_adapters: int = 64
@dataclass
class CacheConfig:
"""Config of key value cache."""
block_size: int
num_cpu_blocks: int
num_gpu_blocks: int
window_size: int = -1
cache_max_entry_count: float = 0.8
max_prefill_token_num: int = 4096
@dataclass
class ModelConfig:
"""Config of model."""
hidden_size: int
num_layers: int
num_attention_heads: int
num_key_value_heads: int
bos_token_id: int
eos_token_id: int
head_dim: int
sliding_window: int = -1
dtype: torch.dtype = torch.float16
multi_query_attention: bool = False
vocab_size: int = 40000
json_config: dict = field(default_factory=dict)
hf_config: Any = None
init_kwargs: Dict[str, Any] = field(default_factory=dict)
def get_head_size(self):
"""get head size."""
return self.head_dim
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path: str,
trust_remote_code: bool = True):
"""build ModelConfig from model path or name."""
from transformers import AutoConfig
hf_config = AutoConfig.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
return cls.from_hf_config(hf_config, pretrained_model_name_or_path)
@classmethod
def from_hf_config(cls, hf_config: Any, model_path: str = None):
"""from huggingface config."""
if model_path is None:
model_path = ''
def __build_falcon():
"""build falcon."""
num_attention_heads = hf_config.num_attention_heads
if hf_config.new_decoder_architecture:
# 40b-instruct, GQA
kv_head = hf_config.num_kv_heads
            elif hf_config.multi_query:
# 7b-instruct, MQA
kv_head = 1
else:
# rw-1b, MHA
kv_head = num_attention_heads
head_dim = hf_config.hidden_size // num_attention_heads
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=kv_head,
bos_token_id=hf_config.bos_token_id,
eos_token_id=hf_config.eos_token_id,
head_dim=head_dim,
multi_query_attention=hf_config.multi_query,
vocab_size=hf_config.vocab_size,
)
def __build_chatglm():
"""build chatglm."""
head_dim = hf_config.hidden_size // hf_config.num_attention_heads
bos_token_id = hf_config.bos_token_id
if bos_token_id is None:
bos_token_id = hf_config.pad_token_id
init_kwargs = dict(empty_init=False)
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_layers,
num_attention_heads=hf_config.num_attention_heads,
num_key_value_heads=hf_config.multi_query_group_num,
bos_token_id=bos_token_id,
eos_token_id=hf_config.eos_token_id,
head_dim=head_dim,
vocab_size=hf_config.padded_vocab_size,
init_kwargs=init_kwargs)
def __build_gemma():
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
num_key_value_heads=hf_config.num_key_value_heads,
bos_token_id=hf_config.bos_token_id,
eos_token_id=hf_config.eos_token_id,
head_dim=hf_config.head_dim,
vocab_size=hf_config.vocab_size)
def __build_default():
head_dim = hf_config.hidden_size // hf_config.num_attention_heads
num_attention_heads = hf_config.num_attention_heads
num_key_value_heads = getattr(hf_config, 'num_key_value_heads',
num_attention_heads)
use_sliding_window = getattr(hf_config, 'use_sliding_window', True)
sliding_window = -1
if use_sliding_window:
sliding_window = getattr(hf_config, 'sliding_window',
sliding_window) or -1
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
num_key_value_heads=num_key_value_heads,
bos_token_id=hf_config.bos_token_id,
eos_token_id=hf_config.eos_token_id,
sliding_window=sliding_window,
head_dim=head_dim,
vocab_size=hf_config.vocab_size)
if 'falcon' in model_path:
model_config = __build_falcon()
elif 'chatglm' in model_path:
model_config = __build_chatglm()
elif hf_config.model_type == 'gemma':
model_config = __build_gemma()
else:
model_config = __build_default()
model_config.dtype = _get_torch_dtype(hf_config)
model_config.hf_config = hf_config
model_config.json_config = hf_config.to_dict()
return model_config
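# Sketch: building a ModelConfig from a locally constructed config instead of
# a checkpoint on disk; the llama-style shapes below are illustrative only.
if __name__ == '__main__':
    from transformers import LlamaConfig
    llama_cfg = LlamaConfig(hidden_size=4096,
                            num_hidden_layers=32,
                            num_attention_heads=32,
                            num_key_value_heads=8,
                            vocab_size=32000)
    model_config = ModelConfig.from_hf_config(llama_cfg, model_path='llama')
    print(model_config.head_dim,
          model_config.num_key_value_heads,
          model_config.dtype)  # 128 8 torch.float16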
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, Union
import torch
from torch import Tensor, nn
from torch.distributed._tensor import (DeviceMesh, DTensor, Replicate, Shard,
distribute_tensor)
from lmdeploy.pytorch.models.q_modules import QLinear
try:
from peft.tuners.lora import Linear as LoRALinear
except ImportError:
class LoRALinear:
pass
def try_to_local(tensor: Union[Tensor, DTensor]):
"""Try to convert DTensor to Tensor.
Args:
tensor (Tensor|DTensor): Tensor to convert.
"""
if isinstance(tensor, DTensor):
tensor = tensor.to_local()
return tensor
def module_to_local(module: nn.Module):
"""convert all DTensor parameters to Tensor parameters in module.
Args:
module (Module): Module to convert.
"""
for name, mod in module.named_children():
module_to_local(mod)
for name, param in module.named_parameters(recurse=False):
module.register_parameter(name, nn.Parameter(try_to_local(param)))
for name, buf in module.named_buffers(recurse=False):
module.register_buffer(name, try_to_local(buf))
def rowwise_parallelize_linear(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
This function parallelizes the input :class:`nn.Linear` module in
:class:`RowwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
for name, param in module.named_parameters():
dist_spec = ([Shard(1)] if name == 'weight' else
[Replicate()] # type: ignore[list-item]
)
dist_tensor = distribute_tensor(param, device_mesh, dist_spec)
if to_local:
dist_tensor = try_to_local(dist_tensor)
if name == 'bias':
                # rowwise linear would add bias more than once.
dist_tensor /= device_mesh.size()
dist_param = torch.nn.Parameter(dist_tensor)
module.register_parameter(name, dist_param)
# Weight, bias and scale are registered as buffer in QLinear
for name, buffer in module.named_buffers():
dist_spec = ([Shard(1)] if name == 'weight' else
[Replicate()] # type: ignore[list-item]
)
dist_tensor = distribute_tensor(buffer, device_mesh, dist_spec)
if to_local:
dist_tensor = try_to_local(dist_tensor)
if name == 'bias':
                # rowwise linear would add bias more than once.
dist_tensor /= device_mesh.size()
module.register_buffer(name, dist_tensor)
def rowwise_parallelize_loralinear(module: LoRALinear,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""rowwize parallelize lora linear.
Read S-LoRA for more detail.
"""
rowwise_parallelize_linear(module.base_layer,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_A.values():
rowwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_B.values():
colwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
module._tp_mode = 'rowwise'
def rowwise_parallelize_linear_fn(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
    This function parallelizes the input :class:`nn.Linear` module in
:class:`RowwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
if isinstance(module, (torch.nn.Linear, QLinear)):
return rowwise_parallelize_linear(module,
device_mesh=device_mesh,
to_local=to_local)
elif isinstance(module, LoRALinear):
return rowwise_parallelize_loralinear(module,
device_mesh=device_mesh,
to_local=to_local)
else:
raise TypeError(f'Unsupported module: {type(module)}')
def colwise_parallelize_linear(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
This function parallelizes the input :class:`nn.Linear` module in
:class:`ColwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
for name, param in module.named_parameters():
dist_tensor = distribute_tensor(param, device_mesh, [Shard(0)])
if to_local:
dist_tensor = try_to_local(dist_tensor)
dist_param = torch.nn.Parameter(dist_tensor)
module.register_parameter(name, dist_param)
# Weight, bias and scale are registered as buffer in QLinear
for name, buffer in module.named_buffers():
dist_tensor = distribute_tensor(buffer, device_mesh, [Shard(0)])
if to_local:
dist_tensor = try_to_local(dist_tensor)
module.register_buffer(name, dist_tensor)
def colwise_parallelize_loralinear(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""colwise parallelize lora linear."""
colwise_parallelize_linear(module.base_layer,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_A.values():
colwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_B.values():
colwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
module._tp_mode = 'colwise'
def colwise_parallelize_linear_fn(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
    This function parallelizes the input :class:`nn.Linear` module in
:class:`ColwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
if isinstance(module, (torch.nn.Linear, QLinear)):
return colwise_parallelize_linear(module,
device_mesh=device_mesh,
to_local=to_local)
elif isinstance(module, LoRALinear):
return colwise_parallelize_loralinear(module,
device_mesh=device_mesh,
to_local=to_local)
else:
raise TypeError(f'Unsupported module: {type(module)}')
def _partition_module(
mod_name: str,
prefix: str,
module: nn.Module,
device_mesh: DeviceMesh,
func: Callable,
):
"""partition module.
    Parameters in the module won't be forced to be replicated.
Args:
mod_name (str): module name.
prefix (str): Parameter prefix.
module (Module): Module to be partitioned.
device_mesh (DeviceMesh): The device mesh.
func (Callable): partition callback
"""
for name, mod in module.named_children():
child_name = f'{prefix}{name}'
_partition_module(child_name,
child_name + '.',
module=mod,
device_mesh=device_mesh,
func=func)
func(mod_name, module, device_mesh)
def partition_module(module: nn.Module,
device_mesh: DeviceMesh,
func: Callable,
to_local: bool = False):
"""partition module.
    Parameters in the module won't be forced to be replicated.
Args:
module (Module): Module to be partitioned.
device_mesh (DeviceMesh): The device mesh.
func (Callable): partition callback.
to_local (bool): Convert all DTensor parameters to Tensor parameters.
"""
_partition_module('',
'',
module=module,
device_mesh=device_mesh,
func=func)
if to_local:
module_to_local(module)
def replicate_module(model: nn.Module, device_mesh: DeviceMesh):
"""Replicate all parameters in module.
Args:
model (Module): Module to perform replicate.
device_mesh (DeviceMesh): The distribution device mesh.
"""
for name, param in model.named_parameters(recurse=False):
param = distribute_tensor(param,
device_mesh=device_mesh,
placements=[Replicate()]).to_local()
param = nn.Parameter(param)
model.register_parameter(name, param)
for name, buf in model.named_buffers(recurse=False):
buf = distribute_tensor(buf,
device_mesh=device_mesh,
placements=[Replicate()]).to_local()
model.register_buffer(name, buf)
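# Sketch of how the helpers above compose for tensor parallelism. It assumes
# the process group has already been initialised (e.g. via `torchrun`) and
# that each rank owns one CUDA device; the two-layer MLP is a stand-in for a
# real transformer block.
def _demo_tensor_parallel(world_size: int) -> nn.Module:
    """Column-parallelize the first linear and row-parallelize the second."""
    import torch.distributed as dist
    assert dist.is_initialized(), 'launch with torchrun/deepspeed first'
    device_mesh = DeviceMesh('cuda', list(range(world_size)))
    block = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 8))
    def _dist_fn(mod_name: str, mod: nn.Module, mesh: DeviceMesh):
        # the Sequential container itself (empty name) is left untouched
        if mod_name.endswith('0'):
            colwise_parallelize_linear_fn(mod, mesh, to_local=True)
        elif mod_name.endswith('1'):
            rowwise_parallelize_linear_fn(mod, mesh, to_local=True)
    partition_module(block, device_mesh, func=_dist_fn, to_local=True)
    return block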
# Copyright (c) OpenMMLab. All rights reserved.
from .engine import Engine
__all__ = ['Engine']
# Copyright (c) OpenMMLab. All rights reserved.
# modify from: https://github.com/vllm-project/vllm
from typing import Dict, List, Tuple
import torch
from torch.distributed._tensor import DeviceMesh
from lmdeploy.utils import get_logger
from ..config import CacheConfig, ModelConfig
KVCache = Tuple[torch.Tensor, torch.Tensor]
logger = get_logger('lmdeploy')
class CacheEngine:
"""Host and Device memory maintainer.
Args:
cache_config (CacheConfig): config of the cache information.
model_config (ModelConfig): config of the model.
rank (int): distribution rank, 0 on non-distributed environment.
world_size (int): distribution world size, 1 on non-distributed
environment.
device_mesh (DeviceMesh): distribution device mesh.
"""
def __init__(
self,
cache_config: CacheConfig,
model_config: ModelConfig,
rank: int = 0,
world_size: int = 1,
device_mesh: DeviceMesh = None,
) -> None:
if rank == 0:
logger.info(f'build CacheEngine with config:{cache_config}')
self.rank = rank
self.world_size = world_size
if device_mesh is None and self.world_size > 1:
device_mesh = DeviceMesh('cuda', list(range(self.world_size)))
self.device_mesh = device_mesh
self.cache_config = cache_config
self.model_config = model_config
self.block_size = cache_config.block_size
self.head_size = model_config.get_head_size()
self.num_layers = model_config.num_layers
self.num_heads = model_config.num_key_value_heads
if 'kv_cache_dtype' in model_config.json_config:
self.kv_cache_dtype = eval(
model_config.json_config['kv_cache_dtype'])
else:
self.kv_cache_dtype = model_config.dtype
# Initialize the cache.
self.local_gpu_cache = self.allocate_gpu_cache()
self.local_cpu_cache = self.allocate_cpu_cache()
# Initialize the stream for caching operations.
self.cache_stream = torch.cuda.Stream()
assert self.cache_stream != torch.cuda.current_stream()
# Initialize the events for stream synchronization.
self.events = [torch.cuda.Event() for _ in range(self.num_layers)]
logger.debug(
f'Initialize cache engine with {cache_config.num_gpu_blocks}'
f' gpu blocks and {cache_config.num_cpu_blocks} cpu blocks.')
@property
def cpu_cache(self):
"""gpu cache."""
return self.local_cpu_cache
@property
def gpu_cache(self):
"""gpu cache."""
return self.local_gpu_cache
@property
def num_gpu_blocks(self):
"""num gpu blocks."""
return self.cache_config.num_gpu_blocks
@property
def num_cpu_blocks(self):
"""num gpu blocks."""
return self.cache_config.num_cpu_blocks
def get_key_block_shape(self, local: bool = False) -> Tuple[int, int, int]:
"""get shape of key block."""
num_heads = self.num_heads
if local and not self.model_config.multi_query_attention:
assert self.num_heads % self.world_size == 0, \
f'num_heads: {self.num_heads}, world_size: {self.world_size}'
num_heads = self.num_heads // self.world_size
return (
self.block_size,
num_heads,
self.head_size,
)
def get_value_block_shape(self,
local: bool = False) -> Tuple[int, int, int]:
"""get shape of value block."""
num_heads = self.num_heads
if local and not self.model_config.multi_query_attention:
assert self.num_heads % self.world_size == 0, \
f'num_heads: {self.num_heads}, world_size: {self.world_size}'
num_heads = self.num_heads // self.world_size
return (
self.block_size,
num_heads,
self.head_size,
)
def allocate_gpu_cache(self):
"""allocate caches on GPU."""
gpu_cache: List[KVCache] = []
key_block_shape = self.get_key_block_shape(local=True)
value_block_shape = self.get_value_block_shape(local=True)
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_gpu_blocks, *key_block_shape),
dtype=self.kv_cache_dtype,
device='cuda',
)
value_blocks = torch.empty(
size=(self.num_gpu_blocks, *value_block_shape),
dtype=self.kv_cache_dtype,
device='cuda',
)
gpu_cache.append((key_blocks, value_blocks))
return gpu_cache
def allocate_cpu_cache(self):
"""allocate caches on Host."""
cpu_cache: List[KVCache] = []
key_block_shape = self.get_key_block_shape(local=True)
value_block_shape = self.get_value_block_shape(local=True)
        # TODO: pin memory might need to be disabled on WSL
pin_memory = True
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_cpu_blocks, *key_block_shape),
dtype=self.kv_cache_dtype,
pin_memory=pin_memory,
)
value_blocks = torch.empty(
size=(self.num_cpu_blocks, *value_block_shape),
dtype=self.kv_cache_dtype,
pin_memory=pin_memory,
)
cpu_cache.append((key_blocks, value_blocks))
return cpu_cache
def _swap(self, src: List[KVCache], dst: List[KVCache],
src_to_dst: Dict[int, int]):
"""Move caches from src memory to dst memory.
Args:
src (List[KVCache]): Source cache.
dst (List[KVCache]): Destination cache.
src_to_dst (Dict[int, int]): Map between src and dst.
"""
with torch.cuda.stream(self.cache_stream):
for i in range(self.num_layers):
src_key_cache, src_value_cache = src[i]
dst_key_cache, dst_value_cache = dst[i]
for src_id, dst_id in src_to_dst.items():
dst_key_cache[dst_id].copy_(src_key_cache[src_id])
dst_value_cache[dst_id].copy_(src_value_cache[src_id])
event = self.events[i]
event.record(stream=self.cache_stream)
def swap_in(self, src_to_dst: Dict[int, int]) -> None:
"""Move cache from Host to Device.
Args:
src_to_dst (Dict[int, int]): Map between src and dst.
"""
self._swap(self.local_cpu_cache, self.local_gpu_cache, src_to_dst)
def swap_out(self, src_to_dst: Dict[int, int]) -> None:
"""Move cache from Device to Host.
Args:
src_to_dst (Dict[int, int]): Map between src and dst.
"""
self._swap(self.local_gpu_cache, self.local_cpu_cache, src_to_dst)
@staticmethod
def get_cache_block_size(block_size: int,
model_config: ModelConfig,
world_size: int = 1) -> int:
"""Get the required cache size of the model.
        Args:
            block_size (int): The token numbers of the block.
            model_config (ModelConfig): The config of the model.
            world_size (int): The distribution world size.
Return:
int: Required memory size in bytes.
"""
head_size = model_config.get_head_size()
num_layers = model_config.num_layers
num_heads = model_config.num_key_value_heads
if not model_config.multi_query_attention:
num_heads = num_heads // world_size
key_cache_block = block_size * num_heads * head_size
value_cache_block = key_cache_block
total = num_layers * (key_cache_block + value_cache_block)
dtype_size = _get_dtype_size(model_config.dtype)
return dtype_size * total
def _get_dtype_size(dtype: torch.dtype) -> int:
"""get size of the given dtype.
Args:
dtype (torch.dtype): Data type.
Return:
int: size in bytes.
"""
return torch.tensor([], dtype=dtype).element_size()
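# Sketch: sizing the kv cache without allocating anything on the GPU; the
# llama-7B-like shapes below are illustrative values, not a loaded model.
if __name__ == '__main__':
    demo_config = ModelConfig(hidden_size=4096,
                              num_layers=32,
                              num_attention_heads=32,
                              num_key_value_heads=32,
                              bos_token_id=1,
                              eos_token_id=2,
                              head_dim=128)
    block_bytes = CacheEngine.get_cache_block_size(
        block_size=64, model_config=demo_config)
    # 64 tokens * 32 heads * 128 dims * 2 (key + value) * 32 layers * 2 bytes
    print(f'{block_bytes / (1 << 20):.1f} MiB per block')  # 32.0 MiB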