Unverified Commit 823ad849 authored by Chen Xin, committed by GitHub

Refactor model conversion (#296)

* split deploy.py

* fix get_cuda_tensor

* deploy qwen_awq

* fix lint

* add docstring

* fix

* support baichuan/baichuan-awq

* parameterizing size_per_head

* remove try/except

* limit input model_format

* add quant_path param

* remove old deploy.py

* fix path

* fix transformer layer range when load bins

* fix qwen init

* split & save log

* relative import

* update get_config

* WeightFileMgr -> Reader

* rename

* update

* fix init_layer_id

* rename llama.py -> meta_llama.py, hf.py -> llama.py

* reduce code

* update arg description

* fix meta llama

* manually cleanup meta model params
parent 1bbc6e05
@@ -28,8 +28,12 @@ class CLI(object):
model_name (str): The name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): The directory path of the model
model_format (str): The format of the model, fb or hf. 'fb' stands
for META's llama format, and 'hf' means huggingface format.
model_format (str): the format of the model, which should be chosen
from ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means huggingface llama format, and 'awq' means a
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, meaning the model_format will be
inferred from model_name
tokenizer_path (str): The path of tokenizer model.
dst_path (str): The destination path that saves outputs.
tp (int): The number of GPUs used for tensor parallelism, which
@@ -38,7 +42,7 @@ class CLI(object):
group_size (int): A parameter used in AWQ to quantize fp16 weights
to 4 bits.
"""
from lmdeploy.serve.turbomind.deploy import main as convert
from lmdeploy.turbomind.deploy.converter import main as convert
convert(model_name,
model_path,
......
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import re
import shutil
from pathlib import Path
import fire
from lmdeploy.model import MODELS
from .source_model.base import INPUT_MODELS
from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
supported_formats = ['llama', 'hf', 'awq', None]
special_input_model_map = {
'qwen': 'qwen',
'baichuan': 'baichuan',
'baichuan2': 'baichuan2'
}
def get_package_root_path():
"""Get lmdeploy root path."""
import lmdeploy
return Path(lmdeploy.__file__).parent
def get_tokenizer_path(model_path: str, tokenizer_path: str):
"""Get tokenizer path if not given."""
if tokenizer_path is not None:
assert osp.exists(tokenizer_path), f'{tokenizer_path} does not exist.'
return tokenizer_path
candidate = ['tokenizer.model', 'qwen.tiktoken']
for name in candidate:
tmp_path = osp.join(model_path, name)
if osp.exists(tmp_path):
tokenizer_path = tmp_path
break
assert tokenizer_path, 'please supply tokenizer path via --tokenizer-path'
return tokenizer_path
def get_model_format(model_name: str, model_format: str):
"""Get model format if not given or equal awq."""
# get model name prefix
if model_name.find('-') != -1:
model_name = model_name[:model_name.find('-')]
# rules:
# 1) llama -> match special -> hf (if not matched)
# 2) append awq (if model_format is awq)
inferred_model_format = model_format
if model_format in [None, 'hf']:
inferred_model_format = special_input_model_map.get(model_name, 'hf')
elif model_format == 'awq':
inferred_model_format = special_input_model_map.get(model_name,
'hf') + '-awq'
return inferred_model_format
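# Illustrative only (not part of the diff): how get_model_format maps
# (model_name, model_format) to an INPUT_MODELS registry key, e.g.
#   get_model_format('llama2-7b', None)      -> 'hf'
#   get_model_format('qwen-7b', None)        -> 'qwen'
#   get_model_format('internlm-7b', 'awq')   -> 'hf-awq'
#   get_model_format('baichuan2-7b', 'awq')  -> 'baichuan2-awq'
#   get_model_format('llama-7b', 'llama')    -> 'llama'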
def create_workspace(_path: str):
"""Create a workspace.
Args:
_path (str): the path of the workspace
"""
if osp.exists(_path):
print(f'remove workspace in directory {_path}')
shutil.rmtree(_path)
print(f'create workspace in directory {_path}')
os.makedirs(_path)
def copy_triton_model_templates(_path: str):
"""copy triton model templates to the specified path.
Args:
_path (str): the target path
Returns:
str: the path of the triton models
"""
root = get_package_root_path()
dir_path = osp.join(root, 'serve', 'turbomind')
triton_models_path = osp.join(dir_path, 'triton_models')
dst_path = osp.join(_path, 'triton_models')
print(f'copy triton model templates from "{triton_models_path}" to '
f'"{dst_path}"')
shutil.copytree(triton_models_path, dst_path, symlinks=True)
service_docker_up_file = osp.join(dir_path, 'service_docker_up.sh')
print(f'copy service_docker_up.sh from "{service_docker_up_file}" to '
f'"{_path}"')
shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path)
return dst_path
def copy_tokenizer(model_path: str, tokenizer_path: str,
triton_models_path: str):
"""Copy tokenizer."""
shutil.copy(
tokenizer_path,
osp.join(triton_models_path,
osp.join('tokenizer', osp.basename(tokenizer_path))))
for _file in os.listdir(model_path):
if _file.endswith('.json') or _file.endswith('.py'):
json_path = osp.join(model_path, _file)
shutil.copy(json_path,
osp.join(triton_models_path, 'tokenizer', _file))
with get_package_root_path() as root_path:
shutil.copy(osp.join(root_path, 'tokenizer.py'),
osp.join(triton_models_path, 'tokenizer'))
def pack_model_repository(workspace_path: str):
"""package the model repository.
Args:
workspace_path: the path of workspace
"""
os.symlink(src=osp.join('..', '..', 'tokenizer'),
dst=osp.join(workspace_path, 'triton_models', 'preprocessing',
'1', 'tokenizer'))
os.symlink(src=osp.join('..', '..', 'tokenizer'),
dst=osp.join(workspace_path, 'triton_models', 'postprocessing',
'1', 'tokenizer'))
os.symlink(src=osp.join('..', '..', 'weights'),
dst=osp.join(workspace_path, 'triton_models', 'interactive',
'1', 'weights'))
model_repo_dir = osp.join(workspace_path, 'model_repository')
os.makedirs(model_repo_dir, exist_ok=True)
os.symlink(src=osp.join('..', 'triton_models', 'interactive'),
dst=osp.join(model_repo_dir, 'turbomind'))
os.symlink(src=osp.join('..', 'triton_models', 'preprocessing'),
dst=osp.join(model_repo_dir, 'preprocessing'))
os.symlink(src=osp.join('..', 'triton_models', 'postprocessing'),
dst=osp.join(model_repo_dir, 'postprocessing'))
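# Resulting layout (sketch) after pack_model_repository('workspace'):
#   workspace/triton_models/preprocessing/1/tokenizer   -> ../../tokenizer
#   workspace/triton_models/postprocessing/1/tokenizer  -> ../../tokenizer
#   workspace/triton_models/interactive/1/weights       -> ../../weights
#   workspace/model_repository/turbomind       -> ../triton_models/interactive
#   workspace/model_repository/preprocessing   -> ../triton_models/preprocessing
#   workspace/model_repository/postprocessing  -> ../triton_models/postprocessing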
def main(model_name: str,
model_path: str,
model_format: str = None,
tokenizer_path: str = None,
dst_path: str = 'workspace',
tp: int = 1,
quant_path: str = None,
group_size: int = 0):
"""deploy llama family models via turbomind.
Args:
model_name (str): the name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): the directory path of the model
model_format (str): the format of the model, which should be chosen
from ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means huggingface llama format, and 'awq' means a
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, meaning the model_format will be
inferred from model_name
tokenizer_path (str): the path of tokenizer model
dst_path (str): the destination path that saves outputs
tp (int): the number of GPUs used for tensor parallelism, should be 2^n
quant_path (str): Path of the quantized model, which can be None.
group_size (int): a parameter used in AWQ to quantize fp16 weights
to 4 bits
"""
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
output_format = 'fp16'
# get input model format
assert model_format in supported_formats, 'the model format ' \
f'should be in {supported_formats}'
inferred_model_format = get_model_format(model_name, model_format)
if inferred_model_format not in INPUT_MODELS.module_dict.keys():
supported_keys = list(INPUT_MODELS.module_dict.keys())
print(f'with model name {model_name} and model format {model_format}, '
f'the inferred model format is {inferred_model_format}, '
f'which is not in supported list {supported_keys}')
exit(-1)
# get tokenizer path
tokenizer_path = get_tokenizer_path(model_path, tokenizer_path)
# create workspace
create_workspace(dst_path)
triton_models_path = copy_triton_model_templates(dst_path)
copy_tokenizer(model_path, tokenizer_path, triton_models_path)
# turbomind config
cfg = TurbomindModelConfig.from_dict({}, allow_none=True)
cfg.model_name = model_name
cfg.tensor_para_size = tp
cfg.rotary_embedding = cfg.size_per_head
cfg.group_size = group_size
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
assert group_size > 0, 'group_size should be > 0'
# convert
print('model_name ', model_name)
print('model_format ', model_format)
print('inferred_model_format ', inferred_model_format)
print('model_path ', model_path)
print('tokenizer_path ', tokenizer_path)
print('output_format ', output_format)
weight_path = osp.join(triton_models_path, 'weights')
input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path,
tokenizer_path=tokenizer_path,
ckpt_path=quant_path)
output_model = OUTPUT_MODELS.get(output_format)(input_model=input_model,
cfg=cfg,
to_file=True,
out_dir=weight_path)
output_model.export()
# update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
with open(osp.join(triton_models_path, 'interactive', 'config.pbtxt'),
'a') as f:
param = \
'parameters {\n key: "tensor_para_size"\n value: {\n ' \
'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \
'parameters {\n key: "model_name"\n value: {\n ' \
'string_value: ' + f'"{model_name}"\n' + ' }\n}\n'
f.write(param)
# pack model repository for triton inference server
pack_model_repository(dst_path)
# update the value of $TP in `service_docker_up.sh`
file_path = osp.join(dst_path, 'service_docker_up.sh')
with open(file_path, 'r') as f:
content = f.read()
content = re.sub('TP=1', f'TP={tp}', content)
with open(file_path, 'w') as f:
f.write(content)
if __name__ == '__main__':
fire.Fire(main)
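# Example invocation (sketch; paths are placeholders and flags follow
# python-fire conventions, so dashes and underscores are interchangeable):
#   python -m lmdeploy.turbomind.deploy.converter llama-7b \
#       /path/to/hf/llama-7b --dst-path ./workspace --tp 2
# or programmatically:
#   from lmdeploy.turbomind.deploy.converter import main as convert
#   convert('llama-7b', '/path/to/hf/llama-7b', dst_path='./workspace', tp=2)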
# Copyright (c) OpenMMLab. All rights reserved.
from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401
from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
from .llama import LlamaModel # noqa: F401
from .llama_awq import LlamaAwqModel # noqa: F401
from .meta_llama import MetaLlamaModel # noqa: F401
from .qwen import QwenModel # noqa: F401
from .qwen_awq import QwenAwqModel # noqa: F401
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader
class BaichuanReader(LlamaReader):
"""BaichuanReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
result = []
pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}'
qkv = self.params[pack_key]
result.extend(torch.split(qkv, qkv.shape[size_dim] // 3, dim=dim))
o = self.params[f'model.layers.{i}.self_attn.o_proj.{kind}']
result.append(o)
return (*result, )
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
return self._attn(i, 'weight', 0, 0)
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return (None, ) * 4
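# Sketch: Baichuan packs q/k/v into a single W_pack tensor of shape
# (3 * hidden_size, hidden_size); _attn(i, 'weight', 0, 0) splits it on dim 0
# into three (hidden_size, hidden_size) chunks and appends o_proj, so callers
# get the same (q, k, v, o) tuple layout as LlamaReader.attn().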
class Baichuan2Reader(BaichuanReader):
"""Baichuan2Reader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def output_weight(self):
"""Get output."""
# https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
tensor = self.params.get('lm_head.weight', None)
if tensor is not None:
tensor = tensor.cuda()
tensor = torch.nn.functional.normalize(tensor)
return tensor
@INPUT_MODELS.register_module(name='baichuan')
class BaichuanModel(LlamaModel):
"""Llama model in baichuan format."""
Reader = BaichuanReader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
super().__init__(model_path, tokenizer_path, **kwargs)
@INPUT_MODELS.register_module(name='baichuan2')
class Baichuan2Model(LlamaModel):
"""Llama model in baichuan format."""
Reader = Baichuan2Reader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
super().__init__(model_path, tokenizer_path, **kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from .baichuan import Baichuan2Model, BaichuanModel, BaichuanReader
from .base import INPUT_MODELS
from .llama_awq import ensure_fp16orint32
class BaichuanAwqReader(BaichuanReader):
"""BaichuanAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
def attn_zero(self, i: int):
"""Get q, k, v, o qzeros for layer i."""
return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
def attn_scale(self, i: int):
"""Get q, k, v, o scales for layer i."""
return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
def ffn(self, i: int):
"""Get ffn qweight for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qweight'))
def ffn_zero(self, i: int):
"""Get ffn qzeros for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qzeros'))
def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))
class Baichuan2AwqReader(BaichuanAwqReader):
"""Baichuan2AwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def output_weight(self):
"""Get output."""
# https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
tensor = self.params.get('lm_head.weight', None)
if tensor is not None:
tensor = tensor.cuda()
tensor = torch.nn.functional.normalize(tensor)
return tensor
@INPUT_MODELS.register_module(name='baichuan-awq')
class BaichuanAwqModel(BaichuanModel):
"""Baichuan awq model in hf format."""
Reader = BaichuanAwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
@INPUT_MODELS.register_module(name='baichuan2-awq')
class Baichuan2AwqModel(Baichuan2Model):
"""Baichuan2 awq model in hf format."""
Reader = Baichuan2AwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
import re
from abc import ABC, abstractmethod
from typing import Dict, Iterator, Tuple, Union
import torch
from mmengine import Registry
INPUT_MODELS = Registry(
'source model', locations=['lmdeploy.turbomind.deploy.source_model.base'])
class BaseReader(ABC):
"""Base checkpoint manager."""
def __init__(self):
pass
@property
@abstractmethod
def start_layer_id(self) -> int:
"""Get the start transformer layer number."""
pass
@property
@abstractmethod
def end_layer_id(self) -> int:
"""Get the end transformer layer number."""
pass
@abstractmethod
def init_layer_id(self) -> None:
"""Get start and end transformer layer number."""
self._start_layer_id = -1
self._end_layer_id = -1
layer_count = {}
for key in self.params:
layer_id = re.findall(self.attn_layer_patten, key)
if len(layer_id) == 0:
continue
layer_id = int(layer_id[0])
if layer_id not in layer_count:
layer_count[layer_id] = 0
layer_count[layer_id] += 1
if len(layer_count) == 0:
return
if not (len(layer_count) > 1 or self.last_bin):
return
max_count = max([layer_count[layer_id] for layer_id in layer_count])
valid_layer_id = [
layer_id for layer_id in layer_count
if layer_count[layer_id] == max_count
]
self._start_layer_id = min(valid_layer_id)
self._end_layer_id = max(valid_layer_id) + 1
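# Example of the range inference (sketch): for a 40-layer model whose first
# shard holds layers 0-19 completely and layer 20 only partially, layer 20 has
# fewer entries than max_count, so _start_layer_id=0 and _end_layer_id=20; the
# partial layer is carried over (via unused_params) to the next shard's Reader.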
@abstractmethod
def clean_up(self, last: bool) -> None:
"""Clean up unused params."""
if last:
self.params.clear()
else:
to_remove = []
for key in self.params:
layer_id = re.findall(self.attn_layer_patten, key)
if len(layer_id) == 0:
to_remove.append(key)
else:
layer_id = int(layer_id[0])
if layer_id < self.end_layer_id:
to_remove.append(key)
for key in to_remove:
self.params.pop(key, None)
torch.cuda.empty_cache()
@abstractmethod
def tok_embeddings(self) -> Union[torch.Tensor, None]:
"""Get embeddings."""
pass
@abstractmethod
def norm_weight(self) -> Union[torch.Tensor, None]:
"""Get norm."""
pass
@abstractmethod
def output_weight(self) -> Union[torch.Tensor, None]:
"""Get output."""
pass
@abstractmethod
def attn(self, i: int) -> Tuple[torch.Tensor]:
"""Get q, k, v, o weight for layer i."""
pass
@abstractmethod
def attn_bias(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get q, k, v, o bias for layer i."""
pass
@abstractmethod
def attn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get q, k, v, o zero point for layer i."""
pass
@abstractmethod
def attn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get q, k, v, o scale for layer i."""
pass
@abstractmethod
def attn_norm(self, i: int) -> torch.Tensor:
"""Get attn norm for layer i."""
pass
@abstractmethod
def ffn(self, i: int) -> Tuple[torch.Tensor]:
"""Get ffn weight for layer i."""
pass
@abstractmethod
def ffn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get ffn zero point for layer i."""
pass
@abstractmethod
def ffn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get ffn scale for layer i."""
pass
@abstractmethod
def ffn_norm(self, i: int) -> torch.Tensor:
"""Get ffn norm for layer i."""
pass
class BaseInputModel(ABC):
"""Base class for input model."""
def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
"""Constructor for BaseInputModel.
Args:
model_path (str): the path of the model.
tokenizer_path (str): the path of the tokenizer model.
"""
self.model_path = model_path
self.tokenizer_path = tokenizer_path
@property
@abstractmethod
def nmgrs(self) -> int:
"""Get number of checkpoint."""
pass
@abstractmethod
def get_mgrs(self) -> Iterator[BaseReader]:
"""Conctruct all BaseReader."""
pass
@abstractmethod
def tokenizer_info(self):
"""Read tokenizer info."""
pass
@abstractmethod
def model_info(self) -> Dict:
"""Read model info."""
pass
def bins(self) -> Iterator[BaseReader]:
"""Get Reader."""
for mgr in self.get_mgrs():
yield mgr
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
import torch
from safetensors.torch import load_file
from sentencepiece import SentencePieceProcessor
from lmdeploy.tokenizer import Tokenizer
from .base import INPUT_MODELS, BaseInputModel, BaseReader
class LlamaReader(BaseReader):
"""LlamaReader."""
attn_layer_patten = r'model.layers.([0-9]+).'
tok_embeddings_key = 'model.embed_tokens.weight'
norm_weight_key = 'model.norm.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__()
self.params = unused_params
self.params.update(new_params)
self.last_bin = last_bin
self.init_layer_id()
def init_layer_id(self):
"""Get start/end transformer layer id."""
super().init_layer_id()
def clean_up(self, last: bool) -> None:
"""Clean up unused params."""
super().clean_up(last)
@property
def start_layer_id(self):
"""Get start transformer layer id."""
return self._start_layer_id
@property
def end_layer_id(self):
"""Get end transformer layer id."""
return self._end_layer_id
def tok_embeddings(self):
"""Get embeddings."""
return self.params.get(self.tok_embeddings_key, None)
def norm_weight(self):
"""Get norm."""
return self.params.get(self.norm_weight_key, None)
def output_weight(self):
"""Get output."""
return self.params.get(self.output_weight_key, None)
def _attn(self, i: int, kind: str, allow_none=False):
"""Get q, k, v, o kind for layer i."""
result = []
for key in ['q', 'k', 'v', 'o']:
tensor = self.params.get(
f'model.layers.{i}.self_attn.{key}_proj.{kind}')
if not allow_none:
assert tensor is not None
result.append(tensor)
return (*result, )
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
return self._attn(i, 'weight')
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return self._attn(i, 'bias', allow_none=True)
def attn_zero(self, i: int):
"""Get q, k, v, o zero point for layer i."""
return (None, ) * 4
def attn_scale(self, i: int):
"""Get q, k, v, o scale for layer i."""
return (None, ) * 4
def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'model.layers.{i}.input_layernorm.weight']
def _ffn(self, i: int, kind: str):
"""Get ffn kind for layer i."""
result = []
for key in ['gate', 'down', 'up']:
tensor = self.params[f'model.layers.{i}.mlp.{key}_proj.{kind}']
result.append(tensor)
return (*result, )
def ffn(self, i: int):
"""Get ffn weight for layer i."""
return self._ffn(i, 'weight')
def ffn_zero(self, i: int):
"""Get ffn zero point for layer i."""
return (None, ) * 3
def ffn_scale(self, i: int):
"""Get ffn scale for layer i."""
return (None, ) * 3
def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'model.layers.{i}.post_attention_layernorm.weight']
@INPUT_MODELS.register_module(name='hf')
class LlamaModel(BaseInputModel):
"""Llama model in hf format."""
Reader = LlamaReader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
super().__init__(model_path, tokenizer_path)
ckpt_path = kwargs.get('ckpt_path')
if ckpt_path is None:
ckpt_path = model_path
self.ckpt_path = ckpt_path
self.ckpt_files = self.get_ckpt()
def get_ckpt(self):
"""Get weight files."""
suffixes = ['.safetensors', '.bin']
files = []
for suffix in suffixes:
files = [
file for file in os.listdir(self.ckpt_path)
if file.endswith(suffix)
]
if len(files) > 0:
break
files = sorted(files)
return files
@property
def nmgrs(self):
"""Get number of checkpoint."""
return len(self.ckpt_files)
def get_mgrs(self):
"""Conctruct all Reader."""
assert self.nmgrs > 0, \
f'could not find checkpoints in {self.ckpt_path}'
unused_params = {}
try:
for i, ckpt in enumerate(self.ckpt_files):
is_last_bin = i == len(self.ckpt_files) - 1
if ckpt.endswith('.bin'):
new_params = torch.load(osp.join(self.ckpt_path, ckpt),
map_location='cpu')
else:
new_params = load_file(osp.join(self.ckpt_path, ckpt))
ret = self.Reader(new_params, unused_params,
i == self.nmgrs - 1)
yield ret
ret.clean_up(is_last_bin)
except GeneratorExit:
ret.clean_up(True)
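# Sketch of the shard-streaming contract: each yielded Reader sees the params
# of the current .bin/.safetensors file plus whatever earlier shards left in
# unused_params; after the caller is done, clean_up() drops the non-layer
# tensors and every layer below end_layer_id (everything, for the last shard),
# keeping peak memory bounded while iterating over checkpoints.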
def tokenizer_info(self):
"""Read tokenizer info."""
assert osp.isfile(self.tokenizer_path), self.tokenizer_path
try:
tk_model = SentencePieceProcessor(model_file=self.tokenizer_path)
# BOS / EOS token IDs
n_words = tk_model.vocab_size
bos_id = tk_model.bos_token_id
eos_id = tk_model.eos_token_id
except Exception:
tk_model = Tokenizer(self.model_path)
n_words = tk_model.vocab_size
bos_id = tk_model.bos_token_id
eos_id = tk_model.eos_token_id
return n_words, bos_id, eos_id
def model_info(self):
"""Read model info."""
params_path = osp.join(self.model_path, 'config.json')
with open(params_path) as f:
model_arg = json.load(f)
num_layer = model_arg['num_hidden_layers']
norm_eps = model_arg['rms_norm_eps']
if 'num_key_value_heads' in model_arg:
kv_head_num = model_arg['num_key_value_heads']
else:
kv_head_num = model_arg['num_attention_heads']
rope_theta = float(model_arg.get('rope_theta', 10000.0))
max_position_embeddings = int(
model_arg.get('max_position_embeddings', 0))
rope_scaling = bool(model_arg.get('rope_scaling', False))
return dict(num_layer=num_layer,
norm_eps=norm_eps,
kv_head_num=kv_head_num,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
use_dynamic_ntk=int(rope_scaling))
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader
def ensure_fp16orint32(tensors: torch.Tensor):
"""Ensure tensors in fp16/int32 format."""
result = []
for tensor in tensors:
if tensor is not None:
if tensor.dtype in [torch.float16, torch.float32, torch.bfloat16]:
result.append(tensor.half())
else:
assert tensor.dtype == torch.int32
result.append(tensor)
else:
result.append(None)
return (*result, )
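# Sketch: ensure_fp16orint32 casts float weights/scales to fp16 and passes AWQ
# int32 qweight/qzeros through untouched, e.g.
#   ensure_fp16orint32([torch.ones(2, dtype=torch.float32),
#                       torch.zeros(2, dtype=torch.int32), None])
#   -> (float16 tensor, int32 tensor, None)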
class LlamaAwqReader(LlamaReader):
"""LlamaAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
return ensure_fp16orint32(self._attn(i, 'qweight'))
def attn_zero(self, i: int):
"""Get q, k, v, o qzeros for layer i."""
return ensure_fp16orint32(self._attn(i, 'qzeros'))
def attn_scale(self, i: int):
"""Get q, k, v, o scales for layer i."""
return ensure_fp16orint32(self._attn(i, 'scales'))
def ffn(self, i: int):
"""Get ffn qweight for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qweight'))
def ffn_zero(self, i: int):
"""Get ffn qzeros for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qzeros'))
def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))
@INPUT_MODELS.register_module(name='hf-awq')
class LlamaAwqModel(LlamaModel):
"""Llama Awq model in hf format."""
Reader = LlamaAwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
import torch
from sentencepiece import SentencePieceProcessor
from .base import INPUT_MODELS, BaseInputModel, BaseReader
def reverse_permute(x: torch.Tensor, size_per_head: int = 128):
"""reverse permute to hf format."""
if x.shape[-1] > 1:
dim = x.shape[-1]
n_heads = dim // size_per_head
return x.view(-1, n_heads, dim // n_heads // 2,
2).transpose(2, 3).reshape(-1, dim)
else: # scales, zeros
dim = x.shape[0]
n_heads = dim // size_per_head
return x.view(n_heads, dim // n_heads // 2, 2,
1).transpose(1, 2).reshape(dim, 1)
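# Sketch (not part of the diff): reverse_permute is the inverse of permute()
# from ..target_model.base, so meta checkpoints end up with the same rotary
# layout as hf ones before export. For a single head with size_per_head=4:
#   x = torch.arange(4, dtype=torch.float16).reshape(1, 4)
#   reverse_permute(x, 4)               # tensor([[0., 2., 1., 3.]])
#   permute(reverse_permute(x, 4), 4)   # recovers x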
class MetaLlamaReader(BaseReader):
"""MetaLlamaReader."""
def __init__(self, model_path: str, start_layer_id: int,
end_layer_id: int):
super().__init__()
self._start_layer_id = start_layer_id
self._end_layer_id = end_layer_id
self.params = self.load_model(model_path)
def init_layer_id(self):
"""Empty."""
pass
def load_model(self, model_path):
"""Load all parameters."""
checkpoints = []
for pattern in ['*.pth', '*.pt']:
checkpoints += sorted(Path(model_path).glob(pattern))
n_ckpt = len(checkpoints)
model_params = {}
def get_param(_name, _size):
if _name not in model_params:
model_params[_name] = torch.zeros(_size,
dtype=torch.float16,
device='cpu')
return model_params[_name]
from tqdm import tqdm
pbar = tqdm(total=n_ckpt, desc='load meta ckpt', leave=False)
for i, ckpt_path in enumerate(checkpoints):
ckpt = torch.load(ckpt_path, map_location='cpu')
for param_name, param_data in ckpt.items():
key, ext = param_name.split('.')[-2:]
# column-parallel
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:
size = param_data.size(0)
if ext == 'weight':
param = get_param(
param_name,
[size * n_ckpt, param_data.size(1)])
param.data[size * i:size * (i + 1), :] = param_data
else: # bias
param = get_param(param_name, [size * n_ckpt])
param.data[size * i:size * (i + 1)] = param_data
# row-parallel
elif key in ['w2', 'wo', 'tok_embeddings']:
size = param_data.size(-1)
if ext == 'weight':
param = get_param(param_name,
[param_data.size(0), size * n_ckpt])
param.data[:, size * i:size * (i + 1)] = param_data
else: # bias
param = get_param(param_name, [size])
param.data = param_data
elif i == 0:
param = get_param(param_name, param_data.size())
param.data = param_data
del ckpt
pbar.update(1)
pbar.close()
for name, param in model_params.items():
# transpose all weights as TurboMind is expecting column-major
# (output_dims, input_dims) -> (input_dims, output_dims)
key = name.split('.')[-2]
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
param.data = param.data.t()
if key in ['wq', 'wk']:
param.data = reverse_permute(param.data)
return model_params
def clean_up(self, last: bool) -> None:
"""Clean up unused params."""
self.params.clear()
@property
def start_layer_id(self):
"""Get start transformer layer id."""
return self._start_layer_id
@property
def end_layer_id(self):
"""Get end transformer layer id."""
return self._end_layer_id
def tok_embeddings(self):
"""Get embeddings."""
return self.params.get('tok_embeddings.weight')
def norm_weight(self):
"""Get norm."""
return self.params.get('norm.weight')
def output_weight(self):
"""Get output."""
return self.params.get('output.weight')
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
result = []
for key in ['wq', 'wk', 'wv', 'wo']:
tensor = self.params[f'layers.{i}.attention.{key}.weight']
tensor = tensor.t() if tensor is not None else None
result.append(tensor)
return (*result, )
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
result = []
for key in ['wq', 'wk', 'wv', 'wo']:
tensor = self.params.get(f'layers.{i}.attention.{key}.bias')
tensor = tensor.t() if tensor is not None else None
result.append(tensor)
return (*result, )
def attn_zero(self, i: int):
"""Get q, k, v, o zero point for layer i."""
return (None, ) * 4
def attn_scale(self, i: int):
"""Get q, k, v, o scale for layer i."""
return (None, ) * 4
def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'layers.{i}.attention_norm.weight']
def ffn(self, i: int):
"""Get ffn weight for layer i."""
result = []
for key in ['w1', 'w2', 'w3']:
tensor = self.params[f'layers.{i}.feed_forward.{key}.weight']
result.append(tensor.t())
return (*result, )
def ffn_zero(self, i: int):
"""Get ffn zero point for layer i."""
return (None, ) * 3
def ffn_scale(self, i: int):
"""Get ffn scale for layer i."""
return (None, ) * 3
def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'layers.{i}.ffn_norm.weight']
@INPUT_MODELS.register_module(name='llama')
class MetaLlamaModel(BaseInputModel):
"""Llama model in fb format."""
def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
super().__init__(model_path, tokenizer_path, **kwargs)
@property
def nmgrs(self):
"""Get number of checkpoint."""
return 1
def get_mgrs(self):
"""Conctruct all BaseReader."""
end_layer_id = self.model_info()['num_layer']
try:
if hasattr(self, 'meta_reader'):
yield self.meta_reader
else:
self.meta_reader = MetaLlamaReader(self.model_path, 0,
end_layer_id)
yield self.meta_reader
except GeneratorExit:
pass
def tokenizer_info(self):
"""Read tokenizer info."""
assert osp.isfile(self.tokenizer_path), self.tokenizer_path
sp_model = SentencePieceProcessor(model_file=self.tokenizer_path)
# BOS / EOS token IDs
n_words = sp_model.vocab_size()
bos_id = sp_model.bos_id()
eos_id = sp_model.eos_id()
return n_words, bos_id, eos_id
def model_info(self):
"""Read model info."""
params_path = osp.join(self.model_path, 'params.json')
with open(params_path) as f:
model_arg = json.load(f)
num_layer = model_arg['n_layers']
norm_eps = model_arg['norm_eps']
head_num = model_arg.get('n_heads', 32)
kv_head_num = model_arg.get('n_kv_heads', head_num)
return dict(num_layer=num_layer,
norm_eps=norm_eps,
head_num=head_num,
kv_head_num=kv_head_num)
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
import torch
from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader
class QwenReader(LlamaReader):
"""QwenReader."""
attn_layer_patten = r'transformer.h.([0-9]+).'
tok_embeddings_key = 'transformer.wte.weight'
norm_weight_key = 'transformer.ln_f.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}']
q, k, v = torch.split(qkv, qkv.size(size_dim) // 3, dim=dim)
o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}', None)
if o is None:
o = torch.zeros_like(q)
return q, k, v, o
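# Sketch: Qwen fuses q/k/v into a single c_attn projection; splitting along
# size_dim recovers the three tensors, and when c_proj lacks the requested
# kind (e.g. it has no bias) a zeros_like(q) placeholder keeps the (q, k, v, o)
# tuple layout uniform with the other readers.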
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
return self._attn(i, 'weight', 0, 0)
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return self._attn(i, 'bias', -1, 0)
def attn_zero(self, i: int):
"""Get q, k, v, o zero point for layer i."""
return (None, ) * 4
def attn_scale(self, i: int):
"""Get q, k, v, o scale for layer i."""
return (None, ) * 4
def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'transformer.h.{i}.ln_1.weight']
def _ffn(self, i: int, kind: str):
"""Get ffn kind for layer i."""
result = []
for key in ['w2', 'c_proj', 'w1']:
tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}']
result.append(tensor)
return (*result, )
def ffn(self, i: int):
"""Get ffn weight for layer i."""
return self._ffn(i, 'weight')
def ffn_zero(self, i: int):
"""Get ffn zero point for layer i."""
return (None, ) * 3
def ffn_scale(self, i: int):
"""Get ffn scale for layer i."""
return (None, ) * 3
def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'transformer.h.{i}.ln_2.weight']
@INPUT_MODELS.register_module(name='qwen')
class QwenModel(LlamaModel):
"""Qwen model in hf format."""
Reader = QwenReader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
super().__init__(model_path, tokenizer_path, **kwargs)
def tokenizer_info(self):
"""Read tokenizer info."""
n_words = 151851
bos_id = 0
eos_id = 151643
return n_words, bos_id, eos_id
def model_info(self):
"""Read model info."""
params_path = osp.join(self.model_path, 'config.json')
with open(params_path) as f:
config = json.load(f)
num_layer = config['num_hidden_layers']
norm_eps = config['layer_norm_epsilon']
rope_theta = float(config.get('rotary_emb_base', 10000.0))
if 'num_key_value_heads' in config:
kv_head_num = config['num_key_value_heads']
else:
kv_head_num = config['num_attention_heads']
seq_length = config['seq_length']
use_dynamic_ntk = int(config['use_dynamic_ntk'])
use_logn_attn = int(config['use_logn_attn'])
return dict(num_layer=num_layer,
norm_eps=norm_eps,
kv_head_num=kv_head_num,
rope_theta=rope_theta,
max_position_embeddings=seq_length,
use_dynamic_ntk=int(use_dynamic_ntk),
use_logn_attn=use_logn_attn)
# Copyright (c) OpenMMLab. All rights reserved.
from .base import INPUT_MODELS
from .llama_awq import ensure_fp16orint32
from .qwen import QwenModel, QwenReader
class QwenAwqReader(QwenReader):
"""QwenAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return ensure_fp16orint32(self._attn(i, 'bias', -1, 0))
def attn_zero(self, i: int):
"""Get q, k, v, o qzeros for layer i."""
return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
def attn_scale(self, i: int):
"""Get q, k, v, o scales for layer i."""
return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
def ffn(self, i: int):
"""Get ffn qweight for layer i."""
# ours: w2(silu(w1(x)) * w3(x))
# qwen: c_proj(w1(x) * silu(w2(x)))
return ensure_fp16orint32(self._ffn(i, 'qweight'))
def ffn_zero(self, i: int):
"""Get ffn qzeros for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qzeros'))
def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))
@INPUT_MODELS.register_module(name='qwen-awq')
class QwenAwqModel(QwenModel):
"""Qwen awq model in hf format."""
Reader = QwenAwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
from .fp import TurbomindModel # noqa: F401
from .w4 import TurbomindW4Model # noqa: F401
# Copyright (c) OpenMMLab. All rights reserved.
import configparser
import inspect
import os.path as osp
from abc import ABC, abstractmethod
from dataclasses import dataclass
import torch
import tqdm
from mmengine import Registry
from lmdeploy.model import MODELS
from ..source_model.base import BaseInputModel, BaseReader
OUTPUT_MODELS = Registry(
'target model', locations=['lmdeploy.turbomind.deploy.target_model.base'])
def tprint(*args, **kwargs):
from io import StringIO
s = StringIO()
print(*args, **kwargs, file=s, end='')
tqdm.tqdm.write(s.getvalue())
@dataclass
class TurbomindModelConfig:
"""Config for turbomind model."""
model_name: str
tensor_para_size: int
head_num: int
kv_head_num: int
vocab_size: int
num_layer: int
inter_size: int
norm_eps: float
attn_bias: int
start_id: int
end_id: int
session_len: int
weight_type: str = 'fp16'
rotary_embedding: int = 128
rope_theta: float = 10000.0
size_per_head: int = 128
group_size: int = 0
max_batch_size: int = 32
max_context_token_num: int = 4
step_length: int = 1
cache_max_entry_count: int = 48
cache_chunk_size: int = 1
use_context_fmha: int = 1
quant_policy: int = 0
max_position_embeddings: int = 0
use_dynamic_ntk: int = 0
use_logn_attn: int = 0
@classmethod
def from_dict(cls, env, allow_none=False):
"""Construct from dict."""
params = inspect.signature(cls).parameters
used = {k: v for k, v in env.items() if k in params and v is not None}
if not allow_none:
return cls(**used)
else:
default = {
k: None
for k in params.keys() if params[k].default is inspect._empty
}
default.update(used)
return cls(**default)
@property
def valid(self):
"""Check if cfg is valid."""
for _, v in self.__dict__.items():
if v is None:
return False
return True
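# Illustrative use (not part of the diff): allow_none=True fills every field
# that has no default with None, so a partial config can be built first and
# completed later, then checked via `.valid`:
#   cfg = TurbomindModelConfig.from_dict({'model_name': 'llama'}, allow_none=True)
#   cfg.valid    # False until the remaining required fields are filled in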
class BaseOutputModel(ABC):
"""Base output model."""
def __init__(self,
input_model: BaseInputModel,
cfg: TurbomindModelConfig,
to_file: bool = True,
out_dir: str = ''):
super().__init__()
self.input_model = input_model
self.cfg = self.get_config(cfg)
assert self.cfg.valid
self.to_file = to_file
self.out_dir = out_dir
@abstractmethod
def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig:
"""Generate turbomind model config (config.ini)."""
_, bos_id, eos_id = self.input_model.tokenizer_info()
model = MODELS.get(cfg.model_name)()
final_cfg = cfg.__dict__
final_cfg.update(
dict(start_id=bos_id,
end_id=eos_id,
session_len=model.session_len + 8))
final_cfg.update(self.input_model.model_info())
# head_num, vocab_size
for bin in self.input_model.bins():
emb = bin.tok_embeddings()
if emb is not None:
_vocab_size, dim = emb.shape
head_num = dim // cfg.size_per_head
break
final_cfg.update(dict(head_num=head_num, vocab_size=_vocab_size))
return TurbomindModelConfig.from_dict(final_cfg, allow_none=True)
def export_config(self) -> None:
"""export turbomind config."""
if self.to_file:
config = configparser.ConfigParser()
cfg = dict(llama=self.cfg.__dict__)
for section, key_values in cfg.items():
config[section] = key_values
config_path = osp.join(self.out_dir, 'config.ini')
with open(config_path, 'w') as f:
config.write(f)
def export_weight(self, param: torch.Tensor, name: str) -> None:
"""export turbomind weight."""
if self.to_file:
if param.dtype in [torch.float, torch.bfloat16]:
param = param.half()
tprint(name, param.shape)
param.contiguous().cpu().numpy().tofile(
osp.join(self.out_dir, name))
def save_split(self,
tensor: torch.Tensor,
name: str,
split_dim=None,
copy=False) -> None:
"""save split."""
tp = self.cfg.tensor_para_size
if split_dim is not None:
tprint(f'*** splitting {name}, shape={tensor.shape}, '
f'split_dim={split_dim}, tp={tp}')
assert tensor.shape[split_dim] % tp == 0
split_size = tensor.shape[split_dim] // tp
splits = torch.split(tensor, split_size, dim=split_dim)
for i, split in enumerate(splits):
prefix, ext = osp.splitext(name)
self.export_weight(split, f'{prefix}.{i}{ext}')
elif copy:
tprint(f'### copying {name}, shape={tensor.shape}')
copies = [tensor] * tp
for i, copy in enumerate(copies):
prefix, ext = osp.splitext(name)
self.export_weight(copy, f'{prefix}.{i}{ext}')
else:
self.export_weight(tensor, name)
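# Naming sketch: with tensor_para_size=2, save_split(w,
# 'layers.0.attention.wo.weight', split_dim=0) writes
# 'layers.0.attention.wo.0.weight' and 'layers.0.attention.wo.1.weight', each
# holding half of dim 0; copy=True writes the full tensor once per rank under
# the same rank-suffixed names instead.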
def export(self) -> None:
"""Export to turbomind model format."""
num_layer = self.cfg.num_layer
from tqdm import tqdm
pbar = tqdm(total=num_layer, desc='Convert to turbomind format')
self.export_config()
for bin in self.input_model.bins():
self.export_misc(bin)
for i in range(bin.start_layer_id, bin.end_layer_id):
self.export_transformer_block(bin, i)
pbar.update(1)
pbar.close()
# manually clean up meta reader
if hasattr(self.input_model, 'meta_reader'):
self.input_model.meta_reader.clean_up(True)
del self.input_model.meta_reader
torch.cuda.empty_cache()
def export_misc(self, bin: BaseReader) -> None:
"""Export embedding, norm, output weight."""
emb = bin.tok_embeddings()
norm_weight = bin.norm_weight()
output_weight = bin.output_weight()
def pad_weight(tensor):
pad_size = None
vocab_size = self.cfg.vocab_size
tp = self.cfg.tensor_para_size
if vocab_size % tp != 0:
pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size
if pad_size is None:
return tensor
return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size),
'constant', 0)
if emb is not None:
emb = pad_weight(emb)
self.export_weight(emb, 'tok_embeddings.weight')
if norm_weight is not None:
self.export_weight(norm_weight, 'norm.weight')
if output_weight is not None:
output_weight = pad_weight(output_weight)
self.export_weight(output_weight, 'output.weight')
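# pad_weight example (sketch): vocab_size=32003 with tensor_para_size=2 gives
# pad_size=1, so the embedding and output rows are zero-padded to 32004 before
# being split evenly across ranks.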
@abstractmethod
def export_transformer_block(self, bin: BaseReader, i: int) -> None:
"""Export transformer block."""
pass
def permute(x: torch.Tensor, size_per_head: int = 128):
if x.shape[-1] > 1:
dim = x.shape[-1]
n_heads = dim // size_per_head
return x.view(-1, n_heads, 2,
dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
else: # scales, zeros
dim = x.shape[0]
n_heads = dim // size_per_head
return x.view(n_heads, 2, dim // n_heads // 2,
1).transpose(1, 2).reshape(dim, 1)
def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
dim: int):
def reshape(x):
return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1)
qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)
# (input_dim, head_num + 2 * kv_head_num)
return qkv.view(q.size(0), -1)
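# Shape sketch for merge_qkv (weights path, dim=2), assuming already-transposed
# (input_dim, output_dim) tensors as produced by transpose_tensor() in fp.py:
#   q = torch.zeros(8, 8); k = torch.zeros(8, 4); v = torch.zeros(8, 4)
#   merge_qkv(q, k, v, tp=2, dim=2).shape   # torch.Size([8, 16]); the q/k/v
#   # slices are interleaved per tensor-parallel rank, so each rank's shard is
#   # contiguous after save_split(..., split_dim=-1)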
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
import torch
from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute)
def transpose_tensor(input: List[torch.Tensor]):
"""Transpose tensor."""
output = [x.cuda().t() for x in input]
return output
@OUTPUT_MODELS.register_module(name='fp16')
class TurbomindModel(BaseOutputModel):
"""Export to turbomind fp16 format."""
def __init__(self,
input_model: BaseInputModel,
cfg: TurbomindModelConfig,
to_file: bool = True,
out_dir: str = ''):
super().__init__(input_model, cfg, to_file, out_dir)
def get_config(self, cfg: TurbomindModelConfig):
"""Get turbomind config."""
final_cfg = super().get_config(cfg).__dict__
# attn_bias, inter_size
visit = False
attn_bias = 0
for bin in self.input_model.bins():
for i in range(bin.start_layer_id, bin.end_layer_id):
visit = True
w1, _, _ = bin.ffn(i)
inter_size = w1.t().shape[-1]
qb, _, _, _ = bin.attn_bias(i)
if qb is not None:
attn_bias = 1
break
if visit:
break
final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
return TurbomindModelConfig.from_dict(final_cfg)
def export_transformer_block(self, bin: BaseReader, i: int):
"""Export transformer layer i."""
assert bin.start_layer_id <= i < bin.end_layer_id
tp = self.cfg.tensor_para_size
size_per_head = self.cfg.size_per_head
# attn
qw, kw, vw, ow = bin.attn(i)
qw, kw, vw, ow = transpose_tensor([qw, kw, vw, ow])
qw = permute(qw, size_per_head)
kw = permute(kw, size_per_head)
qkv_w = merge_qkv(qw, kw, vw, tp, dim=2)
self.save_split(qkv_w, f'layers.{i}.attention.w_qkv.weight', -1)
self.save_split(ow, f'layers.{i}.attention.wo.weight', 0)
qb, kb, vb, ob = bin.attn_bias(i)
if qb is not None:
qb, kb, vb, ob = transpose_tensor([qb, kb, vb, ob])
qb = permute(qb, size_per_head)
kb = permute(kb, size_per_head)
qkv_b = merge_qkv(qb, kb, vb, tp, dim=1)
self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
self.save_split(ob, f'layers.{i}.attention.wo.bias', copy=True)
# ffn
w1, w2, w3 = bin.ffn(i)
w1, w2, w3 = transpose_tensor([w1, w2, w3])
self.save_split(w1, f'layers.{i}.feed_forward.w1.weight', -1)
self.save_split(w3, f'layers.{i}.feed_forward.w3.weight', -1)
self.save_split(w2, f'layers.{i}.feed_forward.w2.weight', 0)
# norm
attn_norm = bin.attn_norm(i)
ffn_norm = bin.ffn_norm(i)
self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import sys
import torch
import lmdeploy
from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute)
# import _turbomind as _tm
# TODO: find another way import _turbomind
lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
sys.path.append(osp.join(lmdeploy_dir, 'lib'))
import _turbomind as _tm # noqa: E402
def transpose_qk_s4(src: torch.Tensor, group_size):
assert src.is_contiguous()
dst = torch.zeros_like(src)
_tm.transpose_qk_s4_k_m8(src, dst,
src.size(-1) * 8, src.size(0), group_size)
return dst
def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor, w1_s: torch.Tensor,
w3_qw: torch.Tensor, w3_qz: torch.Tensor,
w3_s: torch.Tensor):
def fuse(a: torch.Tensor, b: torch.Tensor):
ab = torch.cat((a, b)).contiguous()
_ab = torch.zeros_like(ab)
_tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0))
return _ab.view(a.size(0), -1)
w13_qw = fuse(w1_qw, w3_qw)
w13_qz = fuse(w1_qz, w3_qz)
w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1)
w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1)
return w13_qw, w13_qz, w13_s
def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
group_size: int):
assert qw.is_contiguous()
assert qz.is_contiguous()
assert s.is_contiguous()
_qw = torch.zeros_like(qw)
_sz = torch.zeros_like(s, dtype=torch.int32) # half2
_ws = torch.zeros_like(s)
_tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz,
qw.size(-1) * 8, qw.size(0), group_size)
return _qw, _sz
def tp_m_s4(x: torch.Tensor, tp: int):
return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3,
1).contiguous()
def get_cuda_tensor(tensors):
"""Get cuda tensor."""
result = map(lambda x: x.cuda() if x is not None else x, tensors)
return (*result, )
@OUTPUT_MODELS.register_module(name='w4')
class TurbomindW4Model(BaseOutputModel):
"""Export to turbomind w4a16 format."""
def __init__(self,
input_model: BaseInputModel,
cfg: TurbomindModelConfig,
to_file: bool = True,
out_dir: str = ''):
super().__init__(input_model, cfg, to_file, out_dir)
def get_config(self, cfg: TurbomindModelConfig):
"""Get turbomind config."""
final_cfg = super().get_config(cfg).__dict__
# attn_bias, inter_size
visit = False
attn_bias = 0
for bin in self.input_model.bins():
for i in range(bin.start_layer_id, bin.end_layer_id):
visit = True
w1s, _, _ = bin.ffn_scale(i)
inter_size = w1s.shape[-1]
qb, _, _, _ = bin.attn_bias(i)
if qb is not None:
attn_bias = 1
break
if visit:
break
final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
return TurbomindModelConfig.from_dict(final_cfg)
def export_transformer_block(self, bin: BaseReader, i: int):
"""Export transformer layer i."""
group_size = self.cfg.group_size
tp = self.cfg.tensor_para_size
size_per_head = self.cfg.size_per_head
# attn
q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))
q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i))
q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i))
q_qw = transpose_qk_s4(q_qw, group_size)
k_qw = transpose_qk_s4(k_qw, group_size)
q_qz = transpose_qk_s4(q_qz, group_size)
k_qz = transpose_qk_s4(k_qz, group_size)
q_s = permute(q_s, size_per_head)
k_s = permute(k_s, size_per_head)
qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
qkv_qw = tp_m_s4(qkv_qw, tp)
self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)
o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)
q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
if q_b is not None:
q_b = permute(q_b, size_per_head)
k_b = permute(k_b, size_per_head)
qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)
# ffn weights
w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))
w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i))
w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
w3_s)
w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
w13_qw = tp_m_s4(w13_qw, tp)
self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',
-1)
w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
# norm
attn_norm = bin.attn_norm(i)
ffn_norm = bin.ffn_norm(i)
self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')