Unverified Commit 823ad849 authored by Chen Xin, committed by GitHub

Refactor model conversion (#296)

* split deploy.py

* fix get_cuda_tensor

* deploy qwen_awq

* fix lint

* add docstring

* fix

* support baichuan/baichuan-awq

* parameterizing size_per_head

* remove try/except

* limit input model_format

* add quant_path param

* remove old deploy.py

* fix path

* fix transformer layer range when load bins

* fix qwen init

* split & save log

* relative import

* update get_config

* WeightFileMgr -> Reader

* rename

* update

* fix init_layer_id

* rename llama.py -> meta_llama.py, hf.py -> llama.py

* reduce code

* update arg description

* fix meta llama

* manually cleanup meta model params
parent 1bbc6e05
@@ -28,8 +28,12 @@ class CLI(object):
model_name (str): The name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): The directory path of the model
model_format (str): The format of the model, fb or hf. 'fb' stands
for META's llama format, and 'hf' means huggingface format.
model_format (str): the format of the model, which should be chosen
from ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means huggingface llama format, and 'awq' means a
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, meaning the model_format will be
inferred from model_name
tokenizer_path (str): The path of tokenizer model.
dst_path (str): The destination path that saves outputs.
tp (int): The number of GPUs used for tensor parallelism, which
@@ -38,7 +42,7 @@ class CLI(object):
group_size (int): A parameter used in AWQ to quantize fp16 weights
to 4 bits.
"""
from lmdeploy.serve.turbomind.deploy import main as convert
from lmdeploy.turbomind.deploy.converter import main as convert
convert(model_name,
model_path,
......
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import re
import shutil
from pathlib import Path
import fire
from lmdeploy.model import MODELS
from .source_model.base import INPUT_MODELS
from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
supported_formats = ['llama', 'hf', 'awq', None]
special_input_model_map = {
'qwen': 'qwen',
'baichuan': 'baichuan',
'baichuan2': 'baichuan2'
}
def get_package_root_path():
"""Get lmdeploy root path."""
import lmdeploy
return Path(lmdeploy.__file__).parent
def get_tokenizer_path(model_path: str, tokenizer_path: str):
"""Get tokenizer path if not given."""
if tokenizer_path is not None:
assert osp.exists(tokenizer_path), f'{tokenizer_path} does not exist.'
return tokenizer_path
candidate = ['tokenizer.model', 'qwen.tiktoken']
for name in candidate:
tmp_path = osp.join(model_path, name)
if osp.exists(tmp_path):
tokenizer_path = tmp_path
break
assert tokenizer_path, 'please supply tokenizer path via --tokenizer-path'
return tokenizer_path
def get_model_format(model_name: str, model_format: str):
"""Get model format if not given or equal awq."""
# get model name prefix
if model_name.find('-') != -1:
model_name = model_name[:model_name.find('-')]
# rules:
# 1) llama -> match special -> hf (if not matched)
# 2) append awq (if model_format is awq)
inferred_model_format = model_format
if model_format in [None, 'hf']:
inferred_model_format = special_input_model_map.get(model_name, 'hf')
elif model_format == 'awq':
inferred_model_format = special_input_model_map.get(model_name,
'hf') + '-awq'
return inferred_model_format
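# Illustrative only (not part of the diff): how get_model_format maps
# (model_name, model_format) to an INPUT_MODELS registry key, e.g.
#   get_model_format('llama2-7b', None)      -> 'hf'
#   get_model_format('qwen-7b', None)        -> 'qwen'
#   get_model_format('internlm-7b', 'awq')   -> 'hf-awq'
#   get_model_format('baichuan2-7b', 'awq')  -> 'baichuan2-awq'
#   get_model_format('llama-7b', 'llama')    -> 'llama'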
def create_workspace(_path: str):
"""Create a workspace.
Args:
_path (str): the path of the workspace
"""
if osp.exists(_path):
print(f'remove workspace in directory {_path}')
shutil.rmtree(_path)
print(f'create workspace in directory {_path}')
os.makedirs(_path)
def copy_triton_model_templates(_path: str):
"""copy triton model templates to the specified path.
Args:
_path (str): the target path
Returns:
str: the path of the triton models
"""
root = get_package_root_path()
dir_path = osp.join(root, 'serve', 'turbomind')
triton_models_path = osp.join(dir_path, 'triton_models')
dst_path = osp.join(_path, 'triton_models')
print(f'copy triton model templates from "{triton_models_path}" to '
f'"{dst_path}"')
shutil.copytree(triton_models_path, dst_path, symlinks=True)
service_docker_up_file = osp.join(dir_path, 'service_docker_up.sh')
print(f'copy service_docker_up.sh from "{service_docker_up_file}" to '
f'"{_path}"')
shutil.copy(osp.join(dir_path, 'service_docker_up.sh'), _path)
return dst_path
def copy_tokenizer(model_path: str, tokenizer_path: str,
triton_models_path: str):
"""Copy tokenizer."""
shutil.copy(
tokenizer_path,
osp.join(triton_models_path,
osp.join('tokenizer', osp.basename(tokenizer_path))))
for _file in os.listdir(model_path):
if _file.endswith('.json') or _file.endswith('.py'):
json_path = osp.join(model_path, _file)
shutil.copy(json_path,
osp.join(triton_models_path, 'tokenizer', _file))
with get_package_root_path() as root_path:
shutil.copy(osp.join(root_path, 'tokenizer.py'),
osp.join(triton_models_path, 'tokenizer'))
def pack_model_repository(workspace_path: str):
"""package the model repository.
Args:
workspace_path: the path of workspace
"""
os.symlink(src=osp.join('..', '..', 'tokenizer'),
dst=osp.join(workspace_path, 'triton_models', 'preprocessing',
'1', 'tokenizer'))
os.symlink(src=osp.join('..', '..', 'tokenizer'),
dst=osp.join(workspace_path, 'triton_models', 'postprocessing',
'1', 'tokenizer'))
os.symlink(src=osp.join('..', '..', 'weights'),
dst=osp.join(workspace_path, 'triton_models', 'interactive',
'1', 'weights'))
model_repo_dir = osp.join(workspace_path, 'model_repository')
os.makedirs(model_repo_dir, exist_ok=True)
os.symlink(src=osp.join('..', 'triton_models', 'interactive'),
dst=osp.join(model_repo_dir, 'turbomind'))
os.symlink(src=osp.join('..', 'triton_models', 'preprocessing'),
dst=osp.join(model_repo_dir, 'preprocessing'))
os.symlink(src=osp.join('..', 'triton_models', 'postprocessing'),
dst=osp.join(model_repo_dir, 'postprocessing'))
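# Resulting layout (sketch) after pack_model_repository('workspace'):
#   workspace/triton_models/preprocessing/1/tokenizer   -> ../../tokenizer
#   workspace/triton_models/postprocessing/1/tokenizer  -> ../../tokenizer
#   workspace/triton_models/interactive/1/weights       -> ../../weights
#   workspace/model_repository/turbomind       -> ../triton_models/interactive
#   workspace/model_repository/preprocessing   -> ../triton_models/preprocessing
#   workspace/model_repository/postprocessing  -> ../triton_models/postprocessing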
def main(model_name: str,
model_path: str,
model_format: str = None,
tokenizer_path: str = None,
dst_path: str = 'workspace',
tp: int = 1,
quant_path: str = None,
group_size: int = 0):
"""deploy llama family models via turbomind.
Args:
model_name (str): the name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): the directory path of the model
model_format (str): the format of the model, which should be chosen
from ['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means huggingface llama format, and 'awq' means a
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, meaning the model_format will be
inferred from model_name
tokenizer_path (str): the path of tokenizer model
dst_path (str): the destination path that saves outputs
tp (int): the number of GPUs used for tensor parallelism, should be 2^n
quant_path (str): Path of the quantized model, which can be None.
group_size (int): a parameter used in AWQ to quantize fp16 weights
to 4 bits
"""
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
output_format = 'fp16'
# get input model format
assert model_format in supported_formats, 'the model format ' \
f'should be in {supported_formats}'
inferred_model_format = get_model_format(model_name, model_format)
if inferred_model_format not in INPUT_MODELS.module_dict.keys():
supported_keys = list(INPUT_MODELS.module_dict.keys())
print(f'with model name {model_name} and model format {model_format}, '
f'the inferred model format is {inferred_model_format}, '
f'which is not in supported list {supported_keys}')
exit(-1)
# get tokenizer path
tokenizer_path = get_tokenizer_path(model_path, tokenizer_path)
# create workspace
create_workspace(dst_path)
triton_models_path = copy_triton_model_templates(dst_path)
copy_tokenizer(model_path, tokenizer_path, triton_models_path)
# turbomind config
cfg = TurbomindModelConfig.from_dict({}, allow_none=True)
cfg.model_name = model_name
cfg.tensor_para_size = tp
cfg.rotary_embedding = cfg.size_per_head
cfg.group_size = group_size
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
assert group_size > 0, 'group_size should be > 0'
# convert
print('model_name ', model_name)
print('model_format ', model_format)
print('inferred_model_format ', inferred_model_format)
print('model_path ', model_path)
print('tokenizer_path ', tokenizer_path)
print('output_format ', output_format)
weight_path = osp.join(triton_models_path, 'weights')
input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path,
tokenizer_path=tokenizer_path,
ckpt_path=quant_path)
output_model = OUTPUT_MODELS.get(output_format)(input_model=input_model,
cfg=cfg,
to_file=True,
out_dir=weight_path)
output_model.export()
# update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
with open(osp.join(triton_models_path, 'interactive', 'config.pbtxt'),
'a') as f:
param = \
'parameters {\n key: "tensor_para_size"\n value: {\n ' \
'string_value: ' + f'"{tp}"\n' + ' }\n}\n' + \
'parameters {\n key: "model_name"\n value: {\n ' \
'string_value: ' + f'"{model_name}"\n' + ' }\n}\n'
f.write(param)
# pack model repository for triton inference server
pack_model_repository(dst_path)
# update the value of $TP in `service_docker_up.sh`
file_path = osp.join(dst_path, 'service_docker_up.sh')
with open(file_path, 'r') as f:
content = f.read()
content = re.sub('TP=1', f'TP={tp}', content)
with open(file_path, 'w') as f:
f.write(content)
if __name__ == '__main__':
fire.Fire(main)
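# Example invocation (sketch; paths are placeholders and flags follow
# python-fire conventions, so dashes and underscores are interchangeable):
#   python -m lmdeploy.turbomind.deploy.converter llama-7b \
#       /path/to/hf/llama-7b --dst-path ./workspace --tp 2
# or programmatically:
#   from lmdeploy.turbomind.deploy.converter import main as convert
#   convert('llama-7b', '/path/to/hf/llama-7b', dst_path='./workspace', tp=2)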
# Copyright (c) OpenMMLab. All rights reserved.
from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401
from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
from .llama import LlamaModel # noqa: F401
from .llama_awq import LlamaAwqModel # noqa: F401
from .meta_llama import MetaLlamaModel # noqa: F401
from .qwen import QwenModel # noqa: F401
from .qwen_awq import QwenAwqModel # noqa: F401
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader
class BaichuanReader(LlamaReader):
"""BaichuanReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
result = []
pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}'
qkv = self.params[pack_key]
result.extend(torch.split(qkv, qkv.shape[size_dim] // 3, dim=dim))
o = self.params[f'model.layers.{i}.self_attn.o_proj.{kind}']
result.append(o)
return (*result, )
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
return self._attn(i, 'weight', 0, 0)
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return (None, ) * 4
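# Sketch: Baichuan packs q/k/v into a single W_pack tensor of shape
# (3 * hidden_size, hidden_size); _attn(i, 'weight', 0, 0) splits it on dim 0
# into three (hidden_size, hidden_size) chunks and appends o_proj, so callers
# get the same (q, k, v, o) tuple layout as LlamaReader.attn().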
class Baichuan2Reader(BaichuanReader):
"""Baichuan2Reader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def output_weight(self):
"""Get output."""
# https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
tensor = self.params.get('lm_head.weight', None)
if tensor is not None:
tensor = tensor.cuda()
tensor = torch.nn.functional.normalize(tensor)
return tensor
@INPUT_MODELS.register_module(name='baichuan')
class BaichuanModel(LlamaModel):
"""Llama model in baichuan format."""
Reader = BaichuanReader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
super().__init__(model_path, tokenizer_path, **kwargs)
@INPUT_MODELS.register_module(name='baichuan2')
class Baichuan2Model(LlamaModel):
"""Llama model in baichuan format."""
Reader = Baichuan2Reader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
super().__init__(model_path, tokenizer_path, **kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from .baichuan import Baichuan2Model, BaichuanModel, BaichuanReader
from .base import INPUT_MODELS
from .llama_awq import ensure_fp16orint32
class BaichuanAwqReader(BaichuanReader):
"""BaichuanAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
def attn_zero(self, i: int):
"""Get q, k, v, o qzeros for layer i."""
return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
def attn_scale(self, i: int):
"""Get q, k, v, o scales for layer i."""
return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
def ffn(self, i: int):
"""Get ffn qweight for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qweight'))
def ffn_zero(self, i: int):
"""Get ffn qzeros for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qzeros'))
def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))
class Baichuan2AwqReader(BaichuanAwqReader):
"""Baichuan2AwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def output_weight(self):
"""Get output."""
# https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/main/modeling_baichuan.py#L507
tensor = self.params.get('lm_head.weight', None)
if tensor is not None:
tensor = tensor.cuda()
tensor = torch.nn.functional.normalize(tensor)
return tensor
@INPUT_MODELS.register_module(name='baichuan-awq')
class BaichuanAwqModel(BaichuanModel):
"""Baichuan awq model in hf format."""
Reader = BaichuanAwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
@INPUT_MODELS.register_module(name='baichuan2-awq')
class Baichuan2AwqModel(Baichuan2Model):
"""Baichuan2 awq model in hf format."""
Reader = Baichuan2AwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
import re
from abc import ABC, abstractmethod
from typing import Dict, Iterator, Tuple, Union
import torch
from mmengine import Registry
INPUT_MODELS = Registry(
'source model', locations=['lmdeploy.turbomind.deploy.source_model.base'])
class BaseReader(ABC):
"""Base checkpoint manager."""
def __init__(self):
pass
@property
@abstractmethod
def start_layer_id(self) -> int:
"""Get the start transformer layer number."""
pass
@property
@abstractmethod
def end_layer_id(self) -> int:
"""Get the end transformer layer number."""
pass
@abstractmethod
def init_layer_id(self) -> None:
"""Get start and end transformer layer number."""
self._start_layer_id = -1
self._end_layer_id = -1
layer_count = {}
for key in self.params:
layer_id = re.findall(self.attn_layer_patten, key)
if len(layer_id) == 0:
continue
layer_id = int(layer_id[0])
if layer_id not in layer_count:
layer_count[layer_id] = 0
layer_count[layer_id] += 1
if len(layer_count) == 0:
return
if not (len(layer_count) > 1 or self.last_bin):
return
max_count = max([layer_count[layer_id] for layer_id in layer_count])
valid_layer_id = [
layer_id for layer_id in layer_count
if layer_count[layer_id] == max_count
]
self._start_layer_id = min(valid_layer_id)
self._end_layer_id = max(valid_layer_id) + 1
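# Example of the range inference (sketch): for a 40-layer model whose first
# shard holds layers 0-19 completely and layer 20 only partially, layer 20 has
# fewer entries than max_count, so _start_layer_id=0 and _end_layer_id=20; the
# partial layer is carried over (via unused_params) to the next shard's Reader.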
@abstractmethod
def clean_up(self, last: bool) -> None:
"""Clean up unused params."""
if last:
self.params.clear()
else:
to_remove = []
for key in self.params:
layer_id = re.findall(self.attn_layer_patten, key)
if len(layer_id) == 0:
to_remove.append(key)
else:
layer_id = int(layer_id[0])
if layer_id < self.end_layer_id:
to_remove.append(key)
for key in to_remove:
self.params.pop(key, None)
torch.cuda.empty_cache()
@abstractmethod
def tok_embeddings(self) -> Union[torch.Tensor, None]:
"""Get embeddings."""
pass
@abstractmethod
def norm_weight(self) -> Union[torch.Tensor, None]:
"""Get norm."""
pass
@abstractmethod
def output_weight(self) -> Union[torch.Tensor, None]:
"""Get output."""
pass
@abstractmethod
def attn(self, i: int) -> Tuple[torch.Tensor]:
"""Get q, k, v, o weight for layer i."""
pass
@abstractmethod
def attn_bias(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get q, k, v, o bias for layer i."""
pass
@abstractmethod
def attn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get q, k, v, o zero point for layer i."""
pass
@abstractmethod
def attn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get q, k, v, o scale for layer i."""
pass
@abstractmethod
def attn_norm(self, i: int) -> torch.Tensor:
"""Get attn norm for layer i."""
pass
@abstractmethod
def ffn(self, i: int) -> Tuple[torch.Tensor]:
"""Get ffn weight for layer i."""
pass
@abstractmethod
def ffn_zero(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get ffn zero point for layer i."""
pass
@abstractmethod
def ffn_scale(self, i: int) -> Tuple[torch.Tensor, None]:
"""Get ffn scale for layer i."""
pass
@abstractmethod
def ffn_norm(self, i: int) -> torch.Tensor:
"""Get ffn norm for layer i."""
pass
class BaseInputModel(ABC):
"""Base class for input model."""
def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
"""Constructor for BaseInputModel.
Args:
model_path (str): the path of the model.
tokenizer_path (str): the path of the tokenizer model.
"""
self.model_path = model_path
self.tokenizer_path = tokenizer_path
@property
@abstractmethod
def nmgrs(self) -> int:
"""Get number of checkpoint."""
pass
@abstractmethod
def get_mgrs(self) -> Iterator[BaseReader]:
"""Conctruct all BaseReader."""
pass
@abstractmethod
def tokenizer_info(self):
"""Read tokenizer info."""
pass
@abstractmethod
def model_info(self) -> Dict:
"""Read model info."""
pass
def bins(self) -> Iterator[BaseReader]:
"""Get Reader."""
for mgr in self.get_mgrs():
yield mgr
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os
import os.path as osp
import torch
from safetensors.torch import load_file
from sentencepiece import SentencePieceProcessor
from lmdeploy.tokenizer import Tokenizer
from .base import INPUT_MODELS, BaseInputModel, BaseReader
class LlamaReader(BaseReader):
"""LlamaReader."""
attn_layer_patten = r'model.layers.([0-9]+).'
tok_embeddings_key = 'model.embed_tokens.weight'
norm_weight_key = 'model.norm.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__()
self.params = unused_params
self.params.update(new_params)
self.last_bin = last_bin
self.init_layer_id()
def init_layer_id(self):
"""Get start/end transformer layer id."""
super().init_layer_id()
def clean_up(self, last: bool) -> None:
"""Clean up unused params."""
super().clean_up(last)
@property
def start_layer_id(self):
"""Get start transformer layer id."""
return self._start_layer_id
@property
def end_layer_id(self):
"""Get end transformer layer id."""
return self._end_layer_id
def tok_embeddings(self):
"""Get embeddings."""
return self.params.get(self.tok_embeddings_key, None)
def norm_weight(self):
"""Get norm."""
return self.params.get(self.norm_weight_key, None)
def output_weight(self):
"""Get output."""
return self.params.get(self.output_weight_key, None)
def _attn(self, i: int, kind: str, allow_none=False):
"""Get q, k, v, o kind for layer i."""
result = []
for key in ['q', 'k', 'v', 'o']:
tensor = self.params.get(
f'model.layers.{i}.self_attn.{key}_proj.{kind}')
if not allow_none:
assert tensor is not None
result.append(tensor)
return (*result, )
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
return self._attn(i, 'weight')
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return self._attn(i, 'bias', allow_none=True)
def attn_zero(self, i: int):
"""Get q, k, v, o zero point for layer i."""
return (None, ) * 4
def attn_scale(self, i: int):
"""Get q, k, v, o scale for layer i."""
return (None, ) * 4
def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'model.layers.{i}.input_layernorm.weight']
def _ffn(self, i: int, kind: str):
"""Get ffn kind for layer i."""
result = []
for key in ['gate', 'down', 'up']:
tensor = self.params[f'model.layers.{i}.mlp.{key}_proj.{kind}']
result.append(tensor)
return (*result, )
def ffn(self, i: int):
"""Get ffn weight for layer i."""
return self._ffn(i, 'weight')
def ffn_zero(self, i: int):
"""Get ffn zero point for layer i."""
return (None, ) * 3
def ffn_scale(self, i: int):
"""Get ffn scale for layer i."""
return (None, ) * 3
def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'model.layers.{i}.post_attention_layernorm.weight']
@INPUT_MODELS.register_module(name='hf')
class LlamaModel(BaseInputModel):
"""Llama model in hf format."""
Reader = LlamaReader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
super().__init__(model_path, tokenizer_path)
ckpt_path = kwargs.get('ckpt_path')
if ckpt_path is None:
ckpt_path = model_path
self.ckpt_path = ckpt_path
self.ckpt_files = self.get_ckpt()
def get_ckpt(self):
"""Get weight files."""
suffixes = ['.safetensors', '.bin']
files = []
for suffix in suffixes:
files = [
file for file in os.listdir(self.ckpt_path)
if file.endswith(suffix)
]
if len(files) > 0:
break
files = sorted(files)
return files
@property
def nmgrs(self):
"""Get number of checkpoint."""
return len(self.ckpt_files)
def get_mgrs(self):
"""Conctruct all Reader."""
assert self.nmgrs > 0, \
f'could not find checkpoints in {self.ckpt_path}'
unused_params = {}
try:
for i, ckpt in enumerate(self.ckpt_files):
is_last_bin = i == len(self.ckpt_files) - 1
if ckpt.endswith('.bin'):
new_params = torch.load(osp.join(self.ckpt_path, ckpt),
map_location='cpu')
else:
new_params = load_file(osp.join(self.ckpt_path, ckpt))
ret = self.Reader(new_params, unused_params,
i == self.nmgrs - 1)
yield ret
ret.clean_up(is_last_bin)
except GeneratorExit:
ret.clean_up(True)
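# Sketch of the shard-streaming contract: each yielded Reader sees the params
# of the current .bin/.safetensors file plus whatever earlier shards left in
# unused_params; after the caller is done, clean_up() drops the non-layer
# tensors and every layer below end_layer_id (everything, for the last shard),
# keeping peak memory bounded while iterating over checkpoints.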
def tokenizer_info(self):
"""Read tokenizer info."""
assert osp.isfile(self.tokenizer_path), self.tokenizer_path
try:
tk_model = SentencePieceProcessor(model_file=self.tokenizer_path)
# BOS / EOS token IDs
n_words = tk_model.vocab_size
bos_id = tk_model.bos_token_id
eos_id = tk_model.eos_token_id
except Exception:
tk_model = Tokenizer(self.model_path)
n_words = tk_model.vocab_size
bos_id = tk_model.bos_token_id
eos_id = tk_model.eos_token_id
return n_words, bos_id, eos_id
def model_info(self):
"""Read model info."""
params_path = osp.join(self.model_path, 'config.json')
with open(params_path) as f:
model_arg = json.load(f)
num_layer = model_arg['num_hidden_layers']
norm_eps = model_arg['rms_norm_eps']
if 'num_key_value_heads' in model_arg:
kv_head_num = model_arg['num_key_value_heads']
else:
kv_head_num = model_arg['num_attention_heads']
rope_theta = float(model_arg.get('rope_theta', 10000.0))
max_position_embeddings = int(
model_arg.get('max_position_embeddings', 0))
rope_scaling = bool(model_arg.get('rope_scaling', False))
return dict(num_layer=num_layer,
norm_eps=norm_eps,
kv_head_num=kv_head_num,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
use_dynamic_ntk=int(rope_scaling))
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader
def ensure_fp16orint32(tensors: torch.Tensor):
"""Ensure tensors in fp16/int32 format."""
result = []
for tensor in tensors:
if tensor is not None:
if tensor.dtype in [torch.float16, torch.float32, torch.bfloat16]:
result.append(tensor.half())
else:
assert tensor.dtype == torch.int32
result.append(tensor)
else:
result.append(None)
return (*result, )
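# Sketch: ensure_fp16orint32 casts float weights/scales to fp16 and passes AWQ
# int32 qweight/qzeros through untouched, e.g.
#   ensure_fp16orint32([torch.ones(2, dtype=torch.float32),
#                       torch.zeros(2, dtype=torch.int32), None])
#   -> (float16 tensor, int32 tensor, None)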
class LlamaAwqReader(LlamaReader):
"""LlamaAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
return ensure_fp16orint32(self._attn(i, 'qweight'))
def attn_zero(self, i: int):
"""Get q, k, v, o qzeros for layer i."""
return ensure_fp16orint32(self._attn(i, 'qzeros'))
def attn_scale(self, i: int):
"""Get q, k, v, o scales for layer i."""
return ensure_fp16orint32(self._attn(i, 'scales'))
def ffn(self, i: int):
"""Get ffn qweight for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qweight'))
def ffn_zero(self, i: int):
"""Get ffn qzeros for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qzeros'))
def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))
@INPUT_MODELS.register_module(name='hf-awq')
class LlamaAwqModel(LlamaModel):
"""Llama Awq model in hf format."""
Reader = LlamaAwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from pathlib import Path
import torch
from sentencepiece import SentencePieceProcessor
from .base import INPUT_MODELS, BaseInputModel, BaseReader
def reverse_permute(x: torch.Tensor, size_per_head: int = 128):
"""reverse permute to hf format."""
if x.shape[-1] > 1:
dim = x.shape[-1]
n_heads = dim // size_per_head
return x.view(-1, n_heads, dim // n_heads // 2,
2).transpose(2, 3).reshape(-1, dim)
else: # scales, zeros
dim = x.shape[0]
n_heads = dim // size_per_head
return x.view(n_heads, dim // n_heads // 2, 2,
1).transpose(1, 2).reshape(dim, 1)
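# Sketch (not part of the diff): reverse_permute is the inverse of permute()
# from ..target_model.base, so meta checkpoints end up with the same rotary
# layout as hf ones before export. For a single head with size_per_head=4:
#   x = torch.arange(4, dtype=torch.float16).reshape(1, 4)
#   reverse_permute(x, 4)               # tensor([[0., 2., 1., 3.]])
#   permute(reverse_permute(x, 4), 4)   # recovers x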
class MetaLlamaReader(BaseReader):
"""MetaLlamaReader."""
def __init__(self, model_path: str, start_layer_id: int,
end_layer_id: int):
super().__init__()
self._start_layer_id = start_layer_id
self._end_layer_id = end_layer_id
self.params = self.load_model(model_path)
def init_layer_id(self):
"""Empty."""
pass
def load_model(self, model_path):
"""Load all parameters."""
checkpoints = []
for pattern in ['*.pth', '*.pt']:
checkpoints += sorted(Path(model_path).glob(pattern))
n_ckpt = len(checkpoints)
model_params = {}
def get_param(_name, _size):
if _name not in model_params:
model_params[_name] = torch.zeros(_size,
dtype=torch.float16,
device='cpu')
return model_params[_name]
from tqdm import tqdm
pbar = tqdm(total=n_ckpt, desc='load meta ckpt', leave=False)
for i, ckpt_path in enumerate(checkpoints):
ckpt = torch.load(ckpt_path, map_location='cpu')
for param_name, param_data in ckpt.items():
key, ext = param_name.split('.')[-2:]
# column-parallel
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']:
size = param_data.size(0)
if ext == 'weight':
param = get_param(
param_name,
[size * n_ckpt, param_data.size(1)])
param.data[size * i:size * (i + 1), :] = param_data
else: # bias
param = get_param(param_name, [size * n_ckpt])
param.data[size * i:size * (i + 1)] = param_data
# row-parallel
elif key in ['w2', 'wo', 'tok_embeddings']:
size = param_data.size(-1)
if ext == 'weight':
param = get_param(param_name,
[param_data.size(0), size * n_ckpt])
param.data[:, size * i:size * (i + 1)] = param_data
else: # bias
param = get_param(param_name, [size])
param.data = param_data
elif i == 0:
param = get_param(param_name, param_data.size())
param.data = param_data
del ckpt
pbar.update(1)
pbar.close()
for name, param in model_params.items():
# transpose all weights as TurboMind is expecting column-major
# (output_dims, input_dims) -> (input_dims, output_dims)
key = name.split('.')[-2]
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
param.data = param.data.t()
if key in ['wq', 'wk']:
param.data = reverse_permute(param.data)
return model_params
def clean_up(self, last: bool) -> None:
"""Clean up unused params."""
self.params.clear()
@property
def start_layer_id(self):
"""Get start transformer layer id."""
return self._start_layer_id
@property
def end_layer_id(self):
"""Get end transformer layer id."""
return self._end_layer_id
def tok_embeddings(self):
"""Get embeddings."""
return self.params.get('tok_embeddings.weight')
def norm_weight(self):
"""Get norm."""
return self.params.get('norm.weight')
def output_weight(self):
"""Get output."""
return self.params.get('output.weight')
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
result = []
for key in ['wq', 'wk', 'wv', 'wo']:
tensor = self.params[f'layers.{i}.attention.{key}.weight']
tensor = tensor.t() if tensor is not None else None
result.append(tensor)
return (*result, )
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
result = []
for key in ['wq', 'wk', 'wv', 'wo']:
tensor = self.params.get(f'layers.{i}.attention.{key}.bias')
tensor = tensor.t() if tensor is not None else None
result.append(tensor)
return (*result, )
def attn_zero(self, i: int):
"""Get q, k, v, o zero point for layer i."""
return (None, ) * 4
def attn_scale(self, i: int):
"""Get q, k, v, o scale for layer i."""
return (None, ) * 4
def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'layers.{i}.attention_norm.weight']
def ffn(self, i: int):
"""Get ffn weight for layer i."""
result = []
for key in ['w1', 'w2', 'w3']:
tensor = self.params[f'layers.{i}.feed_forward.{key}.weight']
result.append(tensor.t())
return (*result, )
def ffn_zero(self, i: int):
"""Get ffn zero point for layer i."""
return (None, ) * 3
def ffn_scale(self, i: int):
"""Get ffn scale for layer i."""
return (None, ) * 3
def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'layers.{i}.ffn_norm.weight']
@INPUT_MODELS.register_module(name='llama')
class MetaLlamaModel(BaseInputModel):
"""Llama model in fb format."""
def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
super().__init__(model_path, tokenizer_path, **kwargs)
@property
def nmgrs(self):
"""Get number of checkpoint."""
return 1
def get_mgrs(self):
"""Conctruct all BaseReader."""
end_layer_id = self.model_info()['num_layer']
try:
if hasattr(self, 'meta_reader'):
yield self.meta_reader
else:
self.meta_reader = MetaLlamaReader(self.model_path, 0,
end_layer_id)
yield self.meta_reader
except GeneratorExit:
pass
def tokenizer_info(self):
"""Read tokenizer info."""
assert osp.isfile(self.tokenizer_path), self.tokenizer_path
sp_model = SentencePieceProcessor(model_file=self.tokenizer_path)
# BOS / EOS token IDs
n_words = sp_model.vocab_size()
bos_id = sp_model.bos_id()
eos_id = sp_model.eos_id()
return n_words, bos_id, eos_id
def model_info(self):
"""Read model info."""
params_path = osp.join(self.model_path, 'params.json')
with open(params_path) as f:
model_arg = json.load(f)
num_layer = model_arg['n_layers']
norm_eps = model_arg['norm_eps']
head_num = model_arg.get('n_heads', 32)
kv_head_num = model_arg.get('n_kv_heads', head_num)
return dict(num_layer=num_layer,
norm_eps=norm_eps,
head_num=head_num,
kv_head_num=kv_head_num)
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
import torch
from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader
class QwenReader(LlamaReader):
"""QwenReader."""
attn_layer_patten = r'transformer.h.([0-9]+).'
tok_embeddings_key = 'transformer.wte.weight'
norm_weight_key = 'transformer.ln_f.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}']
q, k, v = torch.split(qkv, qkv.size(size_dim) // 3, dim=dim)
o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}', None)
if o is None:
o = torch.zeros_like(q)
return q, k, v, o
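# Sketch: Qwen fuses q/k/v into a single c_attn projection; splitting along
# size_dim recovers the three tensors, and when c_proj lacks the requested
# kind (e.g. it has no bias) a zeros_like(q) placeholder keeps the (q, k, v, o)
# tuple layout uniform with the other readers.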
def attn(self, i: int):
"""Get q, k, v, o weight for layer i."""
return self._attn(i, 'weight', 0, 0)
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return self._attn(i, 'bias', -1, 0)
def attn_zero(self, i: int):
"""Get q, k, v, o zero point for layer i."""
return (None, ) * 4
def attn_scale(self, i: int):
"""Get q, k, v, o scale for layer i."""
return (None, ) * 4
def attn_norm(self, i: int):
"""Get attn norm for layer i."""
return self.params[f'transformer.h.{i}.ln_1.weight']
def _ffn(self, i: int, kind: str):
"""Get ffn kind for layer i."""
result = []
for key in ['w2', 'c_proj', 'w1']:
tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}']
result.append(tensor)
return (*result, )
def ffn(self, i: int):
"""Get ffn weight for layer i."""
return self._ffn(i, 'weight')
def ffn_zero(self, i: int):
"""Get ffn zero point for layer i."""
return (None, ) * 3
def ffn_scale(self, i: int):
"""Get ffn scale for layer i."""
return (None, ) * 3
def ffn_norm(self, i: int):
"""Get ffn norm for layer i."""
return self.params[f'transformer.h.{i}.ln_2.weight']
@INPUT_MODELS.register_module(name='qwen')
class QwenModel(LlamaModel):
"""Qwen model in hf format."""
Reader = QwenReader
def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
super().__init__(model_path, tokenizer_path, **kwargs)
def tokenizer_info(self):
"""Read tokenizer info."""
n_words = 151851
bos_id = 0
eos_id = 151643
return n_words, bos_id, eos_id
def model_info(self):
"""Read model info."""
params_path = osp.join(self.model_path, 'config.json')
with open(params_path) as f:
config = json.load(f)
num_layer = config['num_hidden_layers']
norm_eps = config['layer_norm_epsilon']
rope_theta = float(config.get('rotary_emb_base', 10000.0))
if 'num_key_value_heads' in config:
kv_head_num = config['num_key_value_heads']
else:
kv_head_num = config['num_attention_heads']
seq_length = config['seq_length']
use_dynamic_ntk = int(config['use_dynamic_ntk'])
use_logn_attn = int(config['use_logn_attn'])
return dict(num_layer=num_layer,
norm_eps=norm_eps,
kv_head_num=kv_head_num,
rope_theta=rope_theta,
max_position_embeddings=seq_length,
use_dynamic_ntk=int(use_dynamic_ntk),
use_logn_attn=use_logn_attn)
# Copyright (c) OpenMMLab. All rights reserved.
from .base import INPUT_MODELS
from .llama_awq import ensure_fp16orint32
from .qwen import QwenModel, QwenReader
class QwenAwqReader(QwenReader):
"""QwenAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
return ensure_fp16orint32(self._attn(i, 'qweight', -1, -1))
def attn_bias(self, i: int):
"""Get q, k, v, o bias for layer i."""
return ensure_fp16orint32(self._attn(i, 'bias', -1, 0))
def attn_zero(self, i: int):
"""Get q, k, v, o qzeros for layer i."""
return ensure_fp16orint32(self._attn(i, 'qzeros', -1, -1))
def attn_scale(self, i: int):
"""Get q, k, v, o scales for layer i."""
return ensure_fp16orint32(self._attn(i, 'scales', -1, -1))
def ffn(self, i: int):
"""Get ffn qweight for layer i."""
# ours: w2(silu(w1(x)) * w3(x))
# qwen: c_proj(w1(x) * silu(w2(x)))
return ensure_fp16orint32(self._ffn(i, 'qweight'))
def ffn_zero(self, i: int):
"""Get ffn qzeros for layer i."""
return ensure_fp16orint32(self._ffn(i, 'qzeros'))
def ffn_scale(self, i: int):
"""Get ffn scales for layer i."""
return ensure_fp16orint32(self._ffn(i, 'scales'))
@INPUT_MODELS.register_module(name='qwen-awq')
class QwenAwqModel(QwenModel):
"""Qwen awq model in hf format."""
Reader = QwenAwqReader
def __init__(self,
model_path: str,
tokenizer_path: str,
ckpt_path: str = None,
**kwargs):
super().__init__(model_path,
tokenizer_path,
ckpt_path=ckpt_path,
**kwargs)
# Copyright (c) OpenMMLab. All rights reserved.
from .fp import TurbomindModel # noqa: F401
from .w4 import TurbomindW4Model # noqa: F401
# Copyright (c) OpenMMLab. All rights reserved.
import configparser
import inspect
import os.path as osp
from abc import ABC, abstractmethod
from dataclasses import dataclass
import torch
import tqdm
from mmengine import Registry
from lmdeploy.model import MODELS
from ..source_model.base import BaseInputModel, BaseReader
OUTPUT_MODELS = Registry(
'target model', locations=['lmdeploy.turbomind.deploy.target_model.base'])
def tprint(*args, **kwargs):
from io import StringIO
s = StringIO()
print(*args, **kwargs, file=s, end='')
tqdm.tqdm.write(s.getvalue())
@dataclass
class TurbomindModelConfig:
"""Config for turbomind model."""
model_name: str
tensor_para_size: int
head_num: int
kv_head_num: int
vocab_size: int
num_layer: int
inter_size: int
norm_eps: float
attn_bias: int
start_id: int
end_id: int
session_len: int
weight_type: str = 'fp16'
rotary_embedding: int = 128
rope_theta: float = 10000.0
size_per_head: int = 128
group_size: int = 0
max_batch_size: int = 32
max_context_token_num: int = 4
step_length: int = 1
cache_max_entry_count: int = 48
cache_chunk_size: int = 1
use_context_fmha: int = 1
quant_policy: int = 0
max_position_embeddings: int = 0
use_dynamic_ntk: int = 0
use_logn_attn: int = 0
@classmethod
def from_dict(cls, env, allow_none=False):
"""Construct from dict."""
params = inspect.signature(cls).parameters
used = {k: v for k, v in env.items() if k in params and v is not None}
if not allow_none:
return cls(**used)
else:
default = {
k: None
for k in params.keys() if params[k].default is inspect._empty
}
default.update(used)
return cls(**default)
@property
def valid(self):
"""Check if cfg is valid."""
for _, v in self.__dict__.items():
if v is None:
return False
return True
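# Illustrative use (not part of the diff): allow_none=True fills every field
# that has no default with None, so a partial config can be built first and
# completed later, then checked via `.valid`:
#   cfg = TurbomindModelConfig.from_dict({'model_name': 'llama'}, allow_none=True)
#   cfg.valid    # False until the remaining required fields are filled in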
class BaseOutputModel(ABC):
"""Base output model."""
def __init__(self,
input_model: BaseInputModel,
cfg: TurbomindModelConfig,
to_file: bool = True,
out_dir: str = ''):
super().__init__()
self.input_model = input_model
self.cfg = self.get_config(cfg)
assert self.cfg.valid
self.to_file = to_file
self.out_dir = out_dir
@abstractmethod
def get_config(self, cfg: TurbomindModelConfig) -> TurbomindModelConfig:
"""Generate turbomind model config (config.ini)."""
_, bos_id, eos_id = self.input_model.tokenizer_info()
model = MODELS.get(cfg.model_name)()
final_cfg = cfg.__dict__
final_cfg.update(
dict(start_id=bos_id,
end_id=eos_id,
session_len=model.session_len + 8))
final_cfg.update(self.input_model.model_info())
# head_num, vocab_size
for bin in self.input_model.bins():
emb = bin.tok_embeddings()
if emb is not None:
_vocab_size, dim = emb.shape
head_num = dim // cfg.size_per_head
break
final_cfg.update(dict(head_num=head_num, vocab_size=_vocab_size))
return TurbomindModelConfig.from_dict(final_cfg, allow_none=True)
def export_config(self) -> None:
"""export turbomind config."""
if self.to_file:
config = configparser.ConfigParser()
cfg = dict(llama=self.cfg.__dict__)
for section, key_values in cfg.items():
config[section] = key_values
config_path = osp.join(self.out_dir, 'config.ini')
with open(config_path, 'w') as f:
config.write(f)
def export_weight(self, param: torch.Tensor, name: str) -> None:
"""export turbomind weight."""
if self.to_file:
if param.dtype in [torch.float, torch.bfloat16]:
param = param.half()
tprint(name, param.shape)
param.contiguous().cpu().numpy().tofile(
osp.join(self.out_dir, name))
def save_split(self,
tensor: torch.Tensor,
name: str,
split_dim=None,
copy=False) -> None:
"""save split."""
tp = self.cfg.tensor_para_size
if split_dim is not None:
tprint(f'*** splitting {name}, shape={tensor.shape}, '
f'split_dim={split_dim}, tp={tp}')
assert tensor.shape[split_dim] % tp == 0
split_size = tensor.shape[split_dim] // tp
splits = torch.split(tensor, split_size, dim=split_dim)
for i, split in enumerate(splits):
prefix, ext = osp.splitext(name)
self.export_weight(split, f'{prefix}.{i}{ext}')
elif copy:
tprint(f'### copying {name}, shape={tensor.shape}')
copies = [tensor] * tp
for i, copy in enumerate(copies):
prefix, ext = osp.splitext(name)
self.export_weight(copy, f'{prefix}.{i}{ext}')
else:
self.export_weight(tensor, name)
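# Naming sketch: with tensor_para_size=2, save_split(w,
# 'layers.0.attention.wo.weight', split_dim=0) writes
# 'layers.0.attention.wo.0.weight' and 'layers.0.attention.wo.1.weight', each
# holding half of dim 0; copy=True writes the full tensor once per rank under
# the same rank-suffixed names instead.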
def export(self) -> None:
"""Export to turbomind model format."""
num_layer = self.cfg.num_layer
from tqdm import tqdm
pbar = tqdm(total=num_layer, desc='Convert to turbomind format')
self.export_config()
for bin in self.input_model.bins():
self.export_misc(bin)
for i in range(bin.start_layer_id, bin.end_layer_id):
self.export_transformer_block(bin, i)
pbar.update(1)
pbar.close()
# manually clean up meta reader
if hasattr(self.input_model, 'meta_reader'):
self.input_model.meta_reader.clean_up(True)
del self.input_model.meta_reader
torch.cuda.empty_cache()
def export_misc(self, bin: BaseReader) -> None:
"""Export embedding, norm, output weight."""
emb = bin.tok_embeddings()
norm_weight = bin.norm_weight()
output_weight = bin.output_weight()
def pad_weight(tensor):
pad_size = None
vocab_size = self.cfg.vocab_size
tp = self.cfg.tensor_para_size
if vocab_size % tp != 0:
pad_size = (vocab_size + tp - 1) // tp * tp - vocab_size
if pad_size is None:
return tensor
return torch.nn.functional.pad(tensor, (0, 0, 0, pad_size),
'constant', 0)
if emb is not None:
emb = pad_weight(emb)
self.export_weight(emb, 'tok_embeddings.weight')
if norm_weight is not None:
self.export_weight(norm_weight, 'norm.weight')
if output_weight is not None:
output_weight = pad_weight(output_weight)
self.export_weight(output_weight, 'output.weight')
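# pad_weight example (sketch): vocab_size=32003 with tensor_para_size=2 gives
# pad_size=1, so the embedding and output rows are zero-padded to 32004 before
# being split evenly across ranks.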
@abstractmethod
def export_transformer_block(self, bin: BaseReader, i: int) -> None:
"""Export transformer block."""
pass
def permute(x: torch.Tensor, size_per_head: int = 128):
if x.shape[-1] > 1:
dim = x.shape[-1]
n_heads = dim // size_per_head
return x.view(-1, n_heads, 2,
dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
else: # scales, zeros
dim = x.shape[0]
n_heads = dim // size_per_head
return x.view(n_heads, 2, dim // n_heads // 2,
1).transpose(1, 2).reshape(dim, 1)
def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
dim: int):
def reshape(x):
return x.view(x.size(0), tp, -1) if dim == 2 else x.view(tp, -1)
qkv = torch.cat((reshape(q), reshape(k), reshape(v)), dim=-1)
# (input_dim, head_num + 2 * kv_head_num)
return qkv.view(q.size(0), -1)
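# Shape sketch for merge_qkv (weights path, dim=2), assuming already-transposed
# (input_dim, output_dim) tensors as produced by transpose_tensor() in fp.py:
#   q = torch.zeros(8, 8); k = torch.zeros(8, 4); v = torch.zeros(8, 4)
#   merge_qkv(q, k, v, tp=2, dim=2).shape   # torch.Size([8, 16]); the q/k/v
#   # slices are interleaved per tensor-parallel rank, so each rank's shard is
#   # contiguous after save_split(..., split_dim=-1)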
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
import torch
from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute)
def transpose_tensor(input: List[torch.Tensor]):
"""Transpose tensor."""
output = [x.cuda().t() for x in input]
return output
@OUTPUT_MODELS.register_module(name='fp16')
class TurbomindModel(BaseOutputModel):
"""Export to turbomind fp16 format."""
def __init__(self,
input_model: BaseInputModel,
cfg: TurbomindModelConfig,
to_file: bool = True,
out_dir: str = ''):
super().__init__(input_model, cfg, to_file, out_dir)
def get_config(self, cfg: TurbomindModelConfig):
"""Get turbomind config."""
final_cfg = super().get_config(cfg).__dict__
# attn_bias, inter_size
visit = False
attn_bias = 0
for bin in self.input_model.bins():
for i in range(bin.start_layer_id, bin.end_layer_id):
visit = True
w1, _, _ = bin.ffn(i)
inter_size = w1.t().shape[-1]
qb, _, _, _ = bin.attn_bias(i)
if qb is not None:
attn_bias = 1
break
if visit:
break
final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
return TurbomindModelConfig.from_dict(final_cfg)
def export_transformer_block(self, bin: BaseReader, i: int):
"""Export transformer layer i."""
assert bin.start_layer_id <= i < bin.end_layer_id
tp = self.cfg.tensor_para_size
size_per_head = self.cfg.size_per_head
# attn
qw, kw, vw, ow = bin.attn(i)
qw, kw, vw, ow = transpose_tensor([qw, kw, vw, ow])
qw = permute(qw, size_per_head)
kw = permute(kw, size_per_head)
qkv_w = merge_qkv(qw, kw, vw, tp, dim=2)
self.save_split(qkv_w, f'layers.{i}.attention.w_qkv.weight', -1)
self.save_split(ow, f'layers.{i}.attention.wo.weight', 0)
qb, kb, vb, ob = bin.attn_bias(i)
if qb is not None:
qb, kb, vb, ob = transpose_tensor([qb, kb, vb, ob])
qb = permute(qb, size_per_head)
kb = permute(kb, size_per_head)
qkv_b = merge_qkv(qb, kb, vb, tp, dim=1)
self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
self.save_split(ob, f'layers.{i}.attention.wo.bias', copy=True)
# ffn
w1, w2, w3 = bin.ffn(i)
w1, w2, w3 = transpose_tensor([w1, w2, w3])
self.save_split(w1, f'layers.{i}.feed_forward.w1.weight', -1)
self.save_split(w3, f'layers.{i}.feed_forward.w3.weight', -1)
self.save_split(w2, f'layers.{i}.feed_forward.w2.weight', 0)
# norm
attn_norm = bin.attn_norm(i)
ffn_norm = bin.ffn_norm(i)
self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import sys
import torch
import lmdeploy
from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute)
# import _turbomind as _tm
# TODO: find another way import _turbomind
lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
sys.path.append(osp.join(lmdeploy_dir, 'lib'))
import _turbomind as _tm # noqa: E402
def transpose_qk_s4(src: torch.Tensor, group_size):
assert src.is_contiguous()
dst = torch.zeros_like(src)
_tm.transpose_qk_s4_k_m8(src, dst,
src.size(-1) * 8, src.size(0), group_size)
return dst
def fuse_w1_w3_s4(w1_qw: torch.Tensor, w1_qz: torch.Tensor, w1_s: torch.Tensor,
w3_qw: torch.Tensor, w3_qz: torch.Tensor,
w3_s: torch.Tensor):
def fuse(a: torch.Tensor, b: torch.Tensor):
ab = torch.cat((a, b)).contiguous()
_ab = torch.zeros_like(ab)
_tm.fuse_w1_w3_s4_k_m8(ab, _ab, a.size(-1) * 8, a.size(0))
return _ab.view(a.size(0), -1)
w13_qw = fuse(w1_qw, w3_qw)
w13_qz = fuse(w1_qz, w3_qz)
w13_s = torch.cat((w1_s, w3_s)).view(2, w1_s.size(0), -1)
w13_s = w13_s.permute(1, 2, 0).contiguous().view(w1_s.size(0), -1)
return w13_qw, w13_qz, w13_s
def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
group_size: int):
assert qw.is_contiguous()
assert qz.is_contiguous()
assert s.is_contiguous()
_qw = torch.zeros_like(qw)
_sz = torch.zeros_like(s, dtype=torch.int32) # half2
_ws = torch.zeros_like(s)
_tm.convert_s4_k_m8(_qw, _sz, _ws, qw, s, qz,
qw.size(-1) * 8, qw.size(0), group_size)
return _qw, _sz
def tp_m_s4(x: torch.Tensor, tp: int):
return x.view(x.size(0) // 32, tp, -1, 128).permute(0, 2, 3,
1).contiguous()
def get_cuda_tensor(tensors):
"""Get cuda tensor."""
result = map(lambda x: x.cuda() if x is not None else x, tensors)
return (*result, )
@OUTPUT_MODELS.register_module(name='w4')
class TurbomindW4Model(BaseOutputModel):
"""Export to turbomind w4a16 format."""
def __init__(self,
input_model: BaseInputModel,
cfg: TurbomindModelConfig,
to_file: bool = True,
out_dir: str = ''):
super().__init__(input_model, cfg, to_file, out_dir)
def get_config(self, cfg: TurbomindModelConfig):
"""Get turbomind config."""
final_cfg = super().get_config(cfg).__dict__
# attn_bias, inter_size
visit = False
attn_bias = 0
for bin in self.input_model.bins():
for i in range(bin.start_layer_id, bin.end_layer_id):
visit = True
w1s, _, _ = bin.ffn_scale(i)
inter_size = w1s.shape[-1]
qb, _, _, _ = bin.attn_bias(i)
if qb is not None:
attn_bias = 1
break
if visit:
break
final_cfg.update(dict(attn_bias=attn_bias, inter_size=inter_size))
return TurbomindModelConfig.from_dict(final_cfg)
def export_transformer_block(self, bin: BaseReader, i: int):
"""Export transformer layer i."""
group_size = self.cfg.group_size
tp = self.cfg.tensor_para_size
size_per_head = self.cfg.size_per_head
# attn
q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))
q_qz, k_qz, v_qz, o_qz = get_cuda_tensor(bin.attn_zero(i))
q_s, k_s, v_s, o_s = get_cuda_tensor(bin.attn_scale(i))
q_qw = transpose_qk_s4(q_qw, group_size)
k_qw = transpose_qk_s4(k_qw, group_size)
q_qz = transpose_qk_s4(q_qz, group_size)
k_qz = transpose_qk_s4(k_qz, group_size)
q_s = permute(q_s, size_per_head)
k_s = permute(k_s, size_per_head)
qkv_qw = merge_qkv(q_qw, k_qw, v_qw, tp, dim=2)
qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
qkv_qw = tp_m_s4(qkv_qw, tp)
self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)
o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)
q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
if q_b is not None:
q_b = permute(q_b, size_per_head)
k_b = permute(k_b, size_per_head)
qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)
# ffn weights
w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))
w1_s, w2_s, w3_s = get_cuda_tensor(bin.ffn_scale(i))
w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
w3_s)
w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
w13_qw = tp_m_s4(w13_qw, tp)
self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',
-1)
w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
# norm
attn_norm = bin.attn_norm(i)
ffn_norm = bin.ffn_norm(i)
self.save_split(attn_norm, f'layers.{i}.attention_norm.weight')
self.save_split(ffn_norm, f'layers.{i}.ffn_norm.weight')