### Modifications (Qwen1.5)
1. `requirements/runtime.txt`: pin `transformers==4.38.2`.
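Not part of the patch itself; just a minimal check, assuming the requirements were reinstalled, that the pinned version is the one actually imported:
```python
# Sanity check that the pinned transformers version is installed.
import transformers

assert transformers.__version__ == '4.38.2', transformers.__version__
print('transformers', transformers.__version__)
```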
2. `lmdeploy/turbomind/deploy/source_model/qwen.py`: replace the file contents with the listing below, which adds the weight-reading mapping for Qwen/Qwen2 (a short sketch of the Qwen2 bias handling follows the listing).
```python
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp

import torch

from .base import INPUT_MODELS
from .llama import LlamaModel, LlamaReader


class QwenReader(LlamaReader):
    """QwenReader."""

    attn_layer_patten = r'transformer.h.([0-9]+).'
    tok_embeddings_key = 'transformer.wte.weight'
    norm_weight_key = 'transformer.ln_f.weight'
    output_weight_key = 'lm_head.weight'

    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
        super().__init__(new_params, unused_params, last_bin)

    def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
        """Get q, k, v, o kind for layer i."""
        qkv = self.params[f'transformer.h.{i}.attn.c_attn.{kind}']
        q, k, v = torch.split(qkv, qkv.size(size_dim) // 3, dim=dim)
        o = self.params.get(f'transformer.h.{i}.attn.c_proj.{kind}', None)
        if o is None:
            o = torch.zeros_like(q)
        return q, k, v, o

    def attn(self, i: int):
        """Get q, k, v, o weight for layer i."""
        return self._attn(i, 'weight', 0, 0)

    def attn_bias(self, i: int):
        """Get q, k, v, o bias for layer i."""
        return self._attn(i, 'bias', -1, 0)

    def attn_zero(self, i: int):
        """Get q, k, v, o zero point for layer i."""
        return (None, ) * 4

    def attn_scale(self, i: int):
        """Get q, k, v, o scale for layer i."""
        return (None, ) * 4

    def attn_norm(self, i: int):
        """Get attn norm for layer i."""
        return self.params[f'transformer.h.{i}.ln_1.weight']

    def _ffn(self, i: int, kind: str):
        """Get ffn kind for layer i."""
        result = []
        for key in ['w2', 'c_proj', 'w1']:
            tensor = self.params[f'transformer.h.{i}.mlp.{key}.{kind}']
            result.append(tensor)
        return (*result, )

    def ffn(self, i: int):
        """Get ffn weight for layer i."""
        return self._ffn(i, 'weight')

    def ffn_zero(self, i: int):
        """Get ffn zero point for layer i."""
        return (None, ) * 3

    def ffn_scale(self, i: int):
        """Get ffn scale for layer i."""
        return (None, ) * 3

    def ffn_norm(self, i: int):
        """Get ffn norm for layer i."""
        return self.params[f'transformer.h.{i}.ln_2.weight']


@INPUT_MODELS.register_module(name='qwen')
class QwenModel(LlamaModel):
    """Qwen model in hf format."""

    Reader = QwenReader

    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
        super().__init__(model_path, tokenizer_path, **kwargs)

    def tokenizer_info(self):
        """Read tokenizer info."""
        n_words = 151851
        bos_id = 0
        eos_id = 151643
        return n_words, bos_id, eos_id

    def model_info(self):
        """Read model info."""
        params_path = osp.join(self.model_path, 'config.json')
        with open(params_path) as f:
            config = json.load(f)
        num_layer = config['num_hidden_layers']
        norm_eps = config['layer_norm_epsilon']
        rope_theta = float(config.get('rotary_emb_base', 10000.0))
        if 'num_key_value_heads' in config:
            kv_head_num = config['num_key_value_heads']
        else:
            kv_head_num = config['num_attention_heads']
        seq_length = config['seq_length']
        use_dynamic_ntk = int(config['use_dynamic_ntk'])
        use_logn_attn = int(config['use_logn_attn'])
        return dict(num_layer=num_layer,
                    norm_eps=norm_eps,
                    kv_head_num=kv_head_num,
                    rope_theta=rope_theta,
                    max_position_embeddings=seq_length,
                    use_dynamic_ntk=int(use_dynamic_ntk),
                    use_logn_attn=use_logn_attn)


class Qwen2Reader(LlamaReader):
    """Read qwen2 model weights.

    The weight names of the qwen2 model are similar to llama, except its
    attention bias doesn't include an o_proj bias. Therefore, we make a dummy
    zero o_proj bias to comply with the turbomind llama format.
    """

    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
        super().__init__(new_params, unused_params, last_bin)

    def attn_bias(self, i: int):
        """Get q, k, v bias for layer i."""
        result = []
        for key in ['q', 'k', 'v']:
            tensor = self.params.get(
                f'model.layers.{i}.self_attn.{key}_proj.bias')
            assert tensor is not None
            result.append(tensor)
        tensor = self.params.get(f'model.layers.{i}.self_attn.o_proj.weight')
        dummy_oproj_bias = tensor.new_zeros(tensor.shape[0])
        result.append(dummy_oproj_bias)
        return (*result, )


@INPUT_MODELS.register_module(name='qwen2')
class Qwen2Model(LlamaModel):
    """Qwen model in hf format."""

    Reader = Qwen2Reader

    def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
        super().__init__(model_path, tokenizer_path, **kwargs)

    def tokenizer_info(self):
        """Set tokenizer info.

        Refer to https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_config.json
        """  # noqa: E501
        n_words = 152064
        bos_id = 151643
        eos_id = 151645
        return n_words, bos_id, eos_id
```
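The only Qwen2-specific handling above is the missing `o_proj` bias. Below is a minimal sketch of the dummy zero bias that `Qwen2Reader.attn_bias` appends, using illustrative tensor shapes rather than real model sizes:
```python
import torch

# Qwen2 checkpoints ship q/k/v biases but no o_proj bias; the reader appends
# a zero bias sized from o_proj's output dimension so the returned tuple
# matches the turbomind llama layout (q, k, v, o).
hidden = 4  # illustrative hidden size
params = {
    'model.layers.0.self_attn.q_proj.bias': torch.ones(hidden),
    'model.layers.0.self_attn.k_proj.bias': torch.ones(hidden),
    'model.layers.0.self_attn.v_proj.bias': torch.ones(hidden),
    'model.layers.0.self_attn.o_proj.weight': torch.randn(hidden, hidden),
}

o_weight = params['model.layers.0.self_attn.o_proj.weight']
dummy_oproj_bias = o_weight.new_zeros(o_weight.shape[0])
print(dummy_oproj_bias)  # tensor([0., 0., 0., 0.]) -- the dummy o bias
```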
3. `lmdeploy/turbomind/deploy/converter.py`: modify the section below to add the qwen2 entry (a hypothetical lookup sketch follows the listing).
```python
supported_formats = ['llama', 'hf', 'awq', None]
special_input_model_map = {
    'qwen2': 'qwen2',
    'qwen': 'qwen',
    'baichuan': 'baichuan',
    'baichuan2': 'baichuan2'
}


def get_package_root_path():
    """Get lmdeploy root path."""
    import lmdeploy
```
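For context, `special_input_model_map` maps a model-name keyword to a registered input model. The helper below is only a hypothetical illustration of that lookup; `infer_input_model` and its longest-key-first rule are assumptions made for this example, not the converter's actual matching code:
```python
special_input_model_map = {
    'qwen2': 'qwen2',
    'qwen': 'qwen',
    'baichuan': 'baichuan',
    'baichuan2': 'baichuan2'
}


def infer_input_model(model_name: str, default: str = 'hf') -> str:
    """Hypothetical lookup: prefer the longest (most specific) key so that
    'qwen2-7b' resolves to 'qwen2' rather than 'qwen'."""
    for key in sorted(special_input_model_map, key=len, reverse=True):
        if model_name.lower().startswith(key):
            return special_input_model_map[key]
    return default


print(infer_input_model('qwen2-7b-chat'))  # -> 'qwen2'
print(infer_input_model('qwen-14b-chat'))  # -> 'qwen'
print(infer_input_model('internlm2-7b'))   # -> 'hf' (fallback)
```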
4. `lmdeploy/model.py`: modify the code below to register the qwen2 model names on the existing Qwen chat template.
```python
@MODELS.register_module(name='qwen2-110b')
@MODELS.register_module(name='qwen2-72b')
@MODELS.register_module(name='qwen2-14b')
@MODELS.register_module(name='qwen2-7b')
@MODELS.register_module(name='qwen-72b')
@MODELS.register_module(name='qwen-14b')
@MODELS.register_module(name='qwen-7b')
class Qwen7BChat(BaseModel):
"""Chat template for Qwen-7B-Chat."""
def __init__(self,
session_len=8192,
top_p=0.5,
top_k=40,
```