OpenDAS / Lmdeploy · Commits

Commit 14ad512a, authored Jul 01, 2024 by gaoqiong
Add AWQ multi-GPU support (增加awq 多卡支持)
parent 6ba90df9

Showing 20 changed files with 1140 additions and 157 deletions (+1140 -157)
Changed files:

- 3rdparty/gpufusion/nccl.h (+805 -0)
- README.md (+41 -16)
- lmdeploy/cli/cli.py (+5 -0)
- lmdeploy/turbomind/deploy/converter.py (+6 -1)
- lmdeploy/turbomind/deploy/target_model/base.py (+3 -4)
- lmdeploy/turbomind/deploy/target_model/w4.py (+12 -76)
- lmdeploy/turbomind/turbomind.py (+34 -7)
- lmdeploy/version.py (+1 -1)
- src/turbomind/kernels/gemm_s_f16/format.cu (+64 -3)
- src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h (+5 -0)
- src/turbomind/models/llama/LlamaDecoderLayerWeight.cc (+101 -32)
- src/turbomind/models/llama/LlamaDecoderLayerWeight.h (+2 -0)
- src/turbomind/models/llama/LlamaDenseWeight.h (+1 -0)
- src/turbomind/models/llama/LlamaLinear.h (+15 -16)
- src/turbomind/models/llama/LlamaWeight.cc (+18 -1)
- src/turbomind/models/llama/LlamaWeight.h (+2 -0)
- src/turbomind/python/bind.cpp (+5 -0)
- src/turbomind/triton_backend/llama/LlamaTritonModel.cc (+17 -0)
- src/turbomind/triton_backend/llama/LlamaTritonModel.h (+2 -0)
- src/turbomind/triton_backend/transformer_triton_backend.hpp (+1 -0)
3rdparty/gpufusion/nccl.h (new file, mode 100644, +805 lines; the diff is collapsed)
README.md

````diff
@@ -67,7 +67,9 @@ yum install rapidjson
 export NCCL_LIB_DIR=/opt/dtk/cuda/lib64
 pip3 install -r requirements.txt
 pip3 install urllib3==1.24
-#switch the apt sources to the Tsinghua mirror
+apt-get install rapidjson-dev
+#if the install fails, switch the apt sources to the Tsinghua mirror
+#run apt-get update after adding the mirror
 #vim /etc/apt/sources.list
 #add the Tsinghua mirror entries as follows:
@@ -75,17 +77,17 @@ pip3 install urllib3==1.24
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
-#update after switching sources
-sudo apt-get update
-apt-get install rapidjson-dev
+#after switching sources, update and then reinstall
+#sudo apt-get update
 # set the NCCL environment variable
 export NCCL_LAUNCH_MODE=GROUP
 ```
 Notes:
 1. When starting docker, the `-v /opt/hyhal:/opt/hyhal` mount must not be omitted.
 2. The URL used by the gpufusion wget command may change; if so, open the page it points to and download the corresponding archive.
+3. When building inside the DTK24041 PyTorch image, the dtk shipped with the image already contains gpufusion files; during the build of this project one of them must be replaced: copy lmdeploy/3rdparty/gpufusion/nccl.h into /opt/dtk/cuda/include.
 #### Build and install from source
 - Download the code
````
````diff
@@ -113,13 +115,16 @@ cd dist && pip3 install lmdeploy*
 ### Model conversion
 ```bash
-# <model_name>    the model name ('llama', 'internlm', 'vicuna', 'wizardlm', 'internlm-chat-7b',
-#                 'internlm-chat', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b',
-#                 'baichuan-7b', 'baichuan2-7b', 'puyu', 'llama2', 'qwen-7b', 'qwen-14b',
-#                 'qwen-72b', 'codellama', 'solar', 'ultralm', 'ultracm', 'yi')
+# <model_name>    the model name (['base', 'llama', 'vicuna', 'wizardlm', 'internlm',
+#                 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'internlm2-1_8b',
+#                 'internlm2-7b', 'internlm2-20b', 'internlm2', 'internlm2-chat',
+#                 'internlm2-chat-20b', 'baichuan-base', 'baichuan-7b', 'baichuan2',
+#                 'baichuan2-7b', 'puyu', 'llama2', 'llama-2', 'llama-2-chat', 'qwen',
+#                 'qwen-7b', ..., 'falcon', 'chatglm', 'chatglm2-6b', 'solar', 'solar-70b',
+#                 'ultralm', 'ultracm', 'yi', 'yi-chat', 'yi-200k', 'yi-34b',
+#                 'Mistral-7B-Instruct', 'mixtral', 'gemma', 'deepseek', 'deepseek-chat', 'yi-vl'])
 # <model_path>    path to the model
-# <model_format>  the model format ('llama', 'hf', or None; may be omitted, in which case the
-#                 code infers the format from the model, so it is usually left unset)
-# <model_format>  destination path for the converted output (default ./workspace)
+# <model_format>  the model format ('awq', 'hf', 'llama')
+# <dst_path>      destination path for the converted output (default ./workspace)
 # <tp>            number of GPUs for tensor parallelism; should be a power of two (2^n)
 # <quant_model_path> the AWQ-quantized model
 # for an fp16 model
 lmdeploy convert ${model_name} ${model_path} --model-format ${model_format} --dst-path ${dst_path} --tp ${tp}
 # for an AWQ model
 lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 ```
````
### Running

#### Running from the bash CLI

````diff
@@ -155,21 +160,41 @@ For details on using the api-server, see the documentation [here](docs/zh_cn/serving)
 Deployment of codellama models is described in [codellama](docs/zh_cn/supported_models/codellama.md)
-### AWQ quantized inference
+## AWQ quantized inference
 This release supports quantized inference. The steps are as follows:
 ```bash
-#group_size: the grouping parameter used when the model was quantized, usually 128
+#Quantize with a calibration dataset
+#Any dataset can be used as needed; the example below uses c4
+#Edit the get_c4() function in lmdeploy/lmdeploy/lite/utils/calib_dataloader.py to point at a local copy of the dataset
+lmdeploy lite auto_awq ${model_path} --calib-dataset 'c4' --calib-samples 128 --calib-seqlen 2048 --w-bits 4 --w-group-size 128 --work-dir ${quant_model_path}
+#group_size: the grouping parameter used when the model was quantized; only 128 is supported
 #<tp> number of GPUs for tensor parallelism; should be a power of two (2^n)
 #<dst-path> destination folder for the converted model
 #step1: model conversion
-lmdeploy convert ${model_name} ${model_path} --model_format awq --group-size ${group_size} --tp ${tp} --dst-path ${dst_path}
+lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 #step2: run the model
 lmdeploy chat turbomind ${dst_path} --tp ${tp}
 ```
 Notes:
-1. This release only supports tp=1 single-GPU quantized inference; only the KM-AI card type is supported, K100/Z100/Z100L are not supported yet.
-2. Quantized inference in this release requires first converting the model to turbomind format with `convert` and then running inference; quantized inference directly from an hf model is not supported yet.
-3. This release does not support quantizing with a calibration dataset; the quantized model has to be obtained elsewhere.
+1. This release only supports the KM-AI card type; K100/Z100/Z100L are not supported yet.
+2. For benchmark evaluation, AWQ models cannot be evaluated directly from the hf model; first convert the quantized model to turbomind format with the tool, and the tp used at run time must match the tp specified at conversion time.
+3. For llama2-70b and qwen-72b, set calib-samples to 120 when quantizing with a calibration dataset.
+4. The models supported on multiple GPUs are listed below:
+
+| Model | AWQ TP=1 | AWQ TP=2 | AWQ TP=4 |
+| :----------: | :------: | :--: | :--: |
+| Llama2-7B-chat | Yes | Yes | No |
+| Llama2-13B-chat | Yes | Yes | Yes |
+| Llama2-70B-chat | Yes | Yes | Yes |
+| qwen-7B-chat | Yes | Yes | No |
+| qwen-14B-chat | Yes | No | No |
+| qwen-72B-chat | Yes | Yes | Yes |
+
+Remark: qwen-14b-chat does not support multi-GPU AWQ quantized inference because it contains a GEMM of size [13696, 5120]; with group_size 128 the scale shape is [13696/128, 5120] = [107, 5120], and 107 is not divisible by tp=2 or tp=4. You can use this property to judge whether your own model supports multi-GPU AWQ inference.
````
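The divisibility rule in the remark can be checked ahead of time. Below is a minimal sketch (not part of this commit; the function name, the `gemm_k_dims` input, and the example shapes are illustrative) that applies the same arithmetic to decide whether a model's AWQ scale matrices split evenly across `tp` GPUs.

```python
# Minimal sketch of the multi-GPU AWQ constraint described above.
# `gemm_k_dims` lists the K dimensions of the model's quantized GEMMs
# (e.g. 13696 for qwen-14b-chat's [13696, 5120] GEMM).
def supports_awq_tp(gemm_k_dims, tp, group_size=128):
    """Return True if every scale matrix of shape [K // group_size, N] splits evenly over tp ranks."""
    for k in gemm_k_dims:
        scale_rows = k // group_size      # e.g. 13696 // 128 = 107
        if scale_rows % tp != 0:          # 107 % 2 != 0 and 107 % 4 != 0 -> unsupported
            return False
    return True

if __name__ == '__main__':
    print(supports_awq_tp([13696], tp=2))   # False, matching the qwen-14b-chat case
    print(supports_awq_tp([13696], tp=1))   # True, single GPU works
```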
## result

![AWQ performance data](pc/awq性能数据.png)
lmdeploy/cli/cli.py

```diff
@@ -66,6 +66,11 @@ class CLI(object):
                             type=int,
                             default=2,
                             help='A parameter used in AWQ to control the layout of weight ')
+        parser.add_argument('--w4-pad-size',
+                            type=int,
+                            default=2,
+                            help='A parameter used in AWQ to control the pad size of weight ')
         parser.set_defaults(run=CLI.convert)

     @staticmethod
```
lmdeploy/turbomind/deploy/converter.py

```diff
@@ -197,6 +197,7 @@ def main(model_name: str,
          quant_path: str = None,
          group_size: int = 0,
          w4_weight_layout: int = 2,
+         w4_pad_size: int = 2,
          **kwargs):
     """deploy llama family models via turbomind.
@@ -217,6 +218,7 @@ def main(model_name: str,
         group_size (int): a parameter used in AWQ to quantize fp16 weights
             to 4 bits
         w4_weight_layout (int): a parameter used in AWQ to control the layout of weight
+        w4_pad_size (int): a parameter used in AWQ to control the pad size of weight
         kwargs (dict): other params for convert
     """
@@ -263,12 +265,15 @@ def main(model_name: str,
     cfg.rotary_embedding = cfg.size_per_head
     cfg.group_size = group_size
     cfg.w4_weight_layout = w4_weight_layout
+    cfg.w4_pad_size = w4_pad_size
     if inferred_model_format.find('awq') != -1:
         cfg.weight_type = 'int4'
         output_format = 'w4'
         assert group_size > 0, f'group_size: {group_size} should > 0'
-        print("w4_weight_layout:", w4_weight_layout)
+        # print("w4_weight_layout:", w4_weight_layout)
+        assert w4_weight_layout >= 0 and w4_weight_layout < 3, \
+            f'w4_weight_layout: {w4_weight_layout} should >= 0 and < 3'
+        assert w4_pad_size >= 0 and w4_pad_size < 5, f'w4_pad_size should >= 0 and <5'
     else:
         # output_format = update_output_format(model_name, inferred_model_format,
         #                                      model_path, output_format)
```
lmdeploy/turbomind/deploy/target_model/base.py

```diff
@@ -54,6 +54,7 @@ class TurbomindModelConfig:
     size_per_head: int = 128
     group_size: int = 0
     w4_weight_layout: int = 2
+    w4_pad_size: int = 2
     max_batch_size: int = 64
     max_context_token_num: int = 1
     step_length: int = 1
@@ -208,6 +209,7 @@ class BaseOutputModel(ABC):
                 param = param.to(torch_type)
+                tprint(name, param.shape)
                 _tofile(param, osp.join(self.out_dir, name))
         elif len(self.tm_params) > 0:
             tm_params = self.tm_params
             weight_type = self.cfg.weight_type
@@ -228,6 +230,7 @@ class BaseOutputModel(ABC):
                     torch_tensor = torch_tensor.float()
             for tm_tensor in tm_params[name]:
                 tm_tensor.copy_from(torch_tensor)
+            tm_params.pop(name)
         else:
             tprint('skip export', name, param.shape)
@@ -325,10 +328,6 @@ def permute(x: torch.Tensor, size_per_head: int = 128):
         return x.view(n_heads, 2, dim // n_heads // 2, 1).transpose(1, 2).reshape(dim, 1)

-def permute_trans(x: torch.Tensor):
-    if x.shape[-1] > 1:
-        dim = x.shape[-1]
-        return x.view(-1, x.shape[-1]).transpose(0, 1).reshape(dim, -1)

 def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int, dim: int):
```
lmdeploy/turbomind/deploy/target_model/w4.py

```diff
@@ -8,7 +8,7 @@ import lmdeploy
 from ..source_model.base import BaseInputModel, BaseReader
 from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
-                   merge_qkv, permute, permute_trans)
+                   merge_qkv, permute)

 # import _turbomind as _tm
 # TODO: find another way import _turbomind
@@ -117,6 +117,7 @@ class TurbomindW4Model(BaseOutputModel):
         group_size = self.cfg.group_size
         tp = self.cfg.tensor_para_size
         w4_weight_layout = self.cfg.w4_weight_layout
+        w4_pad_size = self.cfg.w4_pad_size
         size_per_head = self.cfg.size_per_head
         # attn
         q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))
@@ -134,48 +135,15 @@ class TurbomindW4Model(BaseOutputModel):
         qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
         qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)

-        pad_group_count = 2
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if qkv_qw.shape[0] % 4096 == 0:
-                qkv_qw_padding = torch.zeros(group_size * pad_group_count, qkv_qw.shape[1], dtype=torch.int32).cuda()
-                qkv_qw = torch.cat((qkv_qw, qkv_qw_padding), dim=0).contiguous()
-                qkv_qz_padding = torch.zeros(pad_group_count, qkv_qz.shape[1], dtype=torch.int32).cuda()
-                qkv_qz = torch.cat((qkv_qz, qkv_qz_padding), dim=0).contiguous()
-                qkv_s_padding = torch.zeros(pad_group_count, qkv_s.shape[1], dtype=torch.float16).cuda()
-                qkv_s = torch.cat((qkv_s, qkv_s_padding), dim=0).contiguous()
-            qkv_qw, qkv_sz = convert_s4_(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-            qkv_sz = permute_trans(qkv_sz)
-        else:
-            qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-        # print("please set the weight layout\n")
+        qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
         self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
         self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if o_qw.shape[0] % 4096 == 0:
-                o_qw_padding = torch.zeros(group_size * pad_group_count, o_qw.shape[1], dtype=torch.int32).cuda()
-                o_qw = torch.cat((o_qw, o_qw_padding), dim=0).contiguous()
-                o_qz_padding = torch.zeros(pad_group_count, o_qz.shape[1], dtype=torch.int32).cuda()
-                o_qz = torch.cat((o_qz, o_qz_padding), dim=0).contiguous()
-                o_s_padding = torch.zeros(pad_group_count, o_s.shape[1], dtype=torch.float16).cuda()
-                o_s = torch.cat((o_s, o_s_padding), dim=0).contiguous()
-            o_qw, o_sz = convert_s4_(o_qw, o_qz, o_s, group_size)
-            o_sz = permute_trans(o_sz)
-        else:
-            o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
+        o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
         self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
         self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)

         q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
         if q_b is not None:
             q_b = permute(q_b, size_per_head)
@@ -184,6 +152,7 @@ class TurbomindW4Model(BaseOutputModel):
             self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
             self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)

         # ffn weights
         w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
         w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))
@@ -191,45 +160,12 @@ class TurbomindW4Model(BaseOutputModel):
         w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz, w3_s)

-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if w13_qw.shape[0] % 4096 == 0:
-                w13_qw_padding = torch.zeros(group_size * pad_group_count, w13_qw.shape[1], dtype=torch.int32).cuda()
-                w13_qw = torch.cat((w13_qw, w13_qw_padding), dim=0).contiguous()
-                w13_qz_padding = torch.zeros(pad_group_count, w13_qz.shape[1], dtype=torch.int32).cuda()
-                w13_qz = torch.cat((w13_qz, w13_qz_padding), dim=0).contiguous()
-                w13_s_padding = torch.zeros(pad_group_count, w13_s.shape[1], dtype=torch.float16).cuda()
-                w13_s = torch.cat((w13_s, w13_s_padding), dim=0).contiguous()
-            w13_qw, w13_sz = convert_s4_(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
-            w13_sz = permute_trans(w13_sz)
-        else:
-            w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
+        w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
         self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
         self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros', -1)
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            # padding
-            if w2_qw.shape[0] % 4096 == 0:
-                w2_qw_padding = torch.zeros(group_size * pad_group_count, w2_qw.shape[1], dtype=torch.int32).cuda()
-                w2_qw = torch.cat((w2_qw, w2_qw_padding), dim=0).contiguous()
-                w2_qz_padding = torch.zeros(pad_group_count, w2_qz.shape[1], dtype=torch.int32).cuda()
-                w2_qz = torch.cat((w2_qz, w2_qz_padding), dim=0).contiguous()
-                w2_s_padding = torch.zeros(pad_group_count, w2_s.shape[1], dtype=torch.float16).cuda()
-                w2_s = torch.cat((w2_s, w2_s_padding), dim=0).contiguous()
-            w2_qw, w2_sz = convert_s4_(w2_qw, w2_qz, w2_s, group_size)
-            w2_sz = permute_trans(w2_sz)
-        else:
-            w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
+        w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
         self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
         self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
```
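For reference, the row-padding that the deleted Python branches performed (and which the C++ loader now handles via `w4_pad_size`) amounts to appending zero rows so that the packed K dimension grows by a fixed number of quantization groups whenever it is a multiple of 4096. A minimal torch sketch of that idea follows; the helper name and the shapes are illustrative and not code from this repository.

```python
# Sketch of the removed padding step: when the packed K dimension is a multiple
# of 4096, append `pad_group_count` extra quantization groups of zeros to the
# quantized weight, its zero points, and its scales (shapes are illustrative).
import torch

def pad_awq_tensors(qweight, qzeros, scales, group_size=128, pad_group_count=2):
    if qweight.shape[0] % 4096 != 0:
        return qweight, qzeros, scales
    qw_pad = torch.zeros(group_size * pad_group_count, qweight.shape[1], dtype=qweight.dtype)
    qz_pad = torch.zeros(pad_group_count, qzeros.shape[1], dtype=qzeros.dtype)
    s_pad  = torch.zeros(pad_group_count, scales.shape[1], dtype=scales.dtype)
    qweight = torch.cat((qweight, qw_pad), dim=0).contiguous()
    qzeros  = torch.cat((qzeros, qz_pad), dim=0).contiguous()
    scales  = torch.cat((scales, s_pad), dim=0).contiguous()
    return qweight, qzeros, scales

qw = torch.zeros(4096, 512, dtype=torch.int32)          # quantized weight rows along K
qz = torch.zeros(4096 // 128, 512, dtype=torch.int32)   # one zero-point row per group
s  = torch.zeros(4096 // 128, 512, dtype=torch.float16) # one scale row per group
qw, qz, s = pad_awq_tensors(qw, qz, s)
print(qw.shape, qz.shape, s.shape)  # torch.Size([4352, 512]) torch.Size([34, 512]) torch.Size([34, 512])
```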
lmdeploy/turbomind/turbomind.py

```diff
@@ -148,6 +148,7 @@ class TurboMind:
                  model_format: Optional[str] = None,
                  group_size: Optional[int] = None,
                  w4_weight_layout: Optional[int] = None,
+                 w4_pad_size: Optional[int] = None,
                  tp: Optional[int] = None,
                  chat_template_config: Optional[ChatTemplateConfig] = None,
                  **kwargs):
@@ -181,6 +182,7 @@ class TurboMind:
                 model_format=model_format,
                 group_size=group_size,
                 w4_weight_layout=w4_weight_layout,
+                w4_pad_size=w4_pad_size,
                 tp=tp,
                 **kwargs)
@@ -237,6 +239,28 @@ class TurboMind:
             threads.append(t)
         for t in threads:
             t.join()

+    def _modify_weight(self, model_comm):
+        """modify weight if from_hf with awq."""
+        # TODO: support mpi
+        self.node_id = 0
+        self.node_num = 1
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
+        torch.cuda.synchronize()
+
+        def _modify_weight_func(device_id):
+            with cuda_ctx(device_id):
+                rank = self.node_id * self.gpu_count + device_id
+                model_comm.modify_shared_weights(device_id, rank)
+
+        threads = []
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_modify_weight_func, args=(device_id, ))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+
     def _load_kv_qparams(self, model_path, tm_params, **kwargs):
         """Load kv qparams when loading from hf."""
@@ -271,10 +295,10 @@ class TurboMind:
             t.join()

         for _ in range(self.gpu_count):
             tensor_map = que.get()
             for k, v in tensor_map.items():
                 if k not in tm_params:
                     tm_params[k] = []
                 tm_params[k].append(v)

     def _from_hf(self, model_source: ModelSource, model_path: str,
@@ -307,10 +331,11 @@ class TurboMind:
             data_type = 'int4'
             cfg.group_size = 128
             cfg.w4_weight_layout = 2
+            cfg.w4_pad_size = 0
         else:
-            # output_format = update_output_format(cfg.model_name,
-            #                                      inferred_model_format,
-            #                                      model_path, output_format)
+            output_format = update_output_format(cfg.model_name,
+                                                 inferred_model_format,
+                                                 model_path, output_format)
             data_type = output_format
             update_config_weight_type(output_format, cfg)
@@ -342,12 +367,15 @@ class TurboMind:
         # copy hf model weight to turbomind weight
         tm_params = output_model.tm_params
         self._get_model_params(model_comm, tm_params)
         logger.warning(f'get {len(tm_params)} model params')
         output_model.export()
+        self._modify_weight(model_comm)
         # load kv qparams
         self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
         assert len(tm_params) == 0, f'missing {tm_params.keys()}'

         return model_comm
@@ -381,7 +409,6 @@ class TurboMind:
         self.config = cfg
         self.model_name = cfg.model_name
         self.data_type = cfg.weight_type
-        # print("from_workspace_cfg:", cfg)
         # create model
         logger.warning(f'model_config:\n\n{cfg.toini()}')
```
lmdeploy/version.py

```diff
 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple

-__dcu_version__ = '0.2.6'
+__dcu_version__ = '0.2.6+das1.1.git7063377.abi0.dtk2404.torch2.1.0'
 __version__ = '0.2.6'
 short_version = __version__
```
src/turbomind/kernels/gemm_s_f16/format.cu

```diff
@@ -76,15 +76,75 @@ void reformat_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
     permute_u4<0, 1, 2, 3, 4, 5, 6, 7, 8, 9><<<512, 512, 0, st>>>(dst, src, shape);
 }

+template<typename T>
+void PrintData(cudaStream_t stream, const T* input, int size)
+{
+    int input_size = size;
+    T*  h_data     = new T[input_size];
+    cudaMemcpy(h_data, input, input_size * sizeof(T), cudaMemcpyDeviceToHost);
+    if constexpr (std::is_same<T, half>::value) {
+        for (int i = 0; i < input_size; i++) {
+            printf("%f ", __half2float(h_data[i]));
+        }
+    }
+    else if constexpr (std::is_same<T, half2>::value) {
+        for (int i = 0; i < input_size; i++) {
+            printf("x:%f y:%f ", __half2float(h_data[i].data[0]), __half2float(h_data[i].data[1]));
+        }
+    }
+    else if constexpr (std::is_same<T, uint32_t>::value) {
+        for (int i = 0; i < input_size; i++) {
+            printf(" %u ", h_data[i]);
+        }
+    }
+    printf("\n");
+    delete[] h_data;
+}
+
+#define INSTANTIATEPRINTDATA(T)                                                                    \
+    template void PrintData(cudaStream_t stream, const T* input, int size);
+INSTANTIATEPRINTDATA(__half)
+INSTANTIATEPRINTDATA(float)
+INSTANTIATEPRINTDATA(half2)
+INSTANTIATEPRINTDATA(uint32_t)
+
+void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
+{
+    // permutation for [k, m/8] layout
+    Array<int, 10> shape{1, k / 8, 2, 2, 2, 1, m / 8, 2, 2, 2};
+    // 0123456 --> 4,6,7,5,0,3,1,2
+    // permute_u4<4, 6, 7, 5, 0, 3, 1, 2><<<512, 512, 0, st>>>(dst, src, shape);
+    permute_u4<5, 6, 8, 9, 7, 0, 1, 4, 2, 3><<<512, 512, 0, st>>>(dst, src, shape);
+}
+
+__global__ void permute_u32(int num_kernels, uint32_t* dst, const uint32_t* src, int m, int k)
+{
+    // [k, m] --> [m, k]
+    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (id >= num_kernels)
+        return;
+    int j   = id % k;
+    int i   = id / k;
+    dst[id] = src[j * m + i];
+}
+
+void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
+{
+    // permutation for [k, m] layout
+    int num_kernels = k * m;
+    permute_u32<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, st>>>(num_kernels, dst, src, m, k);
+}
+
 __global__ void dequantize_s4_offset_64(uint4* dst, const uint32_t* src, size_t count)
 {
     for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) {
@@ -269,8 +329,7 @@ __global__ void input_padding_kernel(int num_kernels, T* output, const T* input, int m, ...)
 template<typename T>
 void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount)
 {
     // input is [m, k]; output is [m, k + pad_groupcount * group_size]
     int num_kernels = m * (k + pad_groupcount * group_size);
     input_padding_kernel<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, stream>>>(
         num_kernels, output, input, m, k, group_size, pad_groupcount);
 }
@@ -282,3 +341,5 @@ template void input_padding(cudaStream_t stream, T* output, const T* input, int m, ...)
 INSTANTIATEINPUTPADING(__half)

 }  // namespace turbomind
```
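The new `permute_u32` kernel used by `reformat_s4_k_m8_tarnsscale` is a plain [k, m] to [m, k] transpose of 32-bit words. The short Python check below reproduces the same index arithmetic with NumPy; it is illustrative only and not part of the commit.

```python
# The kernel computes, for each output index id: i = id // k, j = id % k, dst[id] = src[j * m + i].
# That is exactly a [k, m] -> [m, k] transpose; verify the index math on the host.
import numpy as np

k, m = 3, 5
src = np.arange(k * m, dtype=np.uint32)   # flattened [k, m] source
dst = np.empty(m * k, dtype=np.uint32)    # flattened [m, k] destination
for idx in range(m * k):
    i, j = divmod(idx, k)                 # i: destination row (0..m-1), j: destination column (0..k-1)
    dst[idx] = src[j * m + i]
assert (dst.reshape(m, k) == src.reshape(k, m).T).all()
print("permute_u32 index math matches a transpose")
```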
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h

```diff
@@ -36,6 +36,11 @@ void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output, const uint32_t* ...)
 template<typename T>
 void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);

+void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);
+void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);
+
+template<typename T>
+void PrintData(cudaStream_t stream, const T* input, int size);
+
 class GemmS4F16 {
 public:
     GemmS4F16();
```
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc

```diff
@@ -17,7 +17,7 @@
 // Modified from
 // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc

+#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
 #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
 #include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/utils/logger.h"
@@ -42,6 +42,7 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
                                                     WeightType weight_type,
                                                     int        group_size,
                                                     int        w4_weight_layout,
+                                                    int        w4_pad_size,
                                                     bool       attn_bias,
                                                     size_t     tensor_para_size,
                                                     size_t     tensor_para_rank):
@@ -60,36 +61,42 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
     self_attn_weights.qkv.type             = weight_type;
     self_attn_weights.qkv.group_size       = group_size;
     self_attn_weights.qkv.w4_weight_layout = w4_weight_layout;
+    self_attn_weights.qkv.w4_pad_size      = w4_pad_size;

     self_attn_weights.output.input_dims       = hidden_units_ / tensor_para_size_;
     self_attn_weights.output.output_dims      = hidden_units_;
     self_attn_weights.output.type             = weight_type;
     self_attn_weights.output.group_size       = group_size;
     self_attn_weights.output.w4_weight_layout = w4_weight_layout;
+    self_attn_weights.output.w4_pad_size      = w4_pad_size;

     ffn_weights.gating.input_dims       = hidden_units_;
     ffn_weights.gating.output_dims      = inter_size_ / tensor_para_size_;
     ffn_weights.gating.type             = weight_type;
     ffn_weights.gating.group_size       = group_size;
     ffn_weights.gating.w4_weight_layout = w4_weight_layout;
+    ffn_weights.gating.w4_pad_size      = w4_pad_size;

     ffn_weights.intermediate.input_dims       = hidden_units_;
     ffn_weights.intermediate.output_dims      = inter_size_ / tensor_para_size_;
     ffn_weights.intermediate.type             = weight_type;
     ffn_weights.intermediate.group_size       = group_size;
     ffn_weights.intermediate.w4_weight_layout = w4_weight_layout;
+    ffn_weights.intermediate.w4_pad_size      = w4_pad_size;

     ffn_weights.fused_gating_intermediate.input_dims       = hidden_units_;
     ffn_weights.fused_gating_intermediate.output_dims      = inter_size_ / tensor_para_size_ * 2;
     ffn_weights.fused_gating_intermediate.type             = weight_type;
     ffn_weights.fused_gating_intermediate.group_size       = group_size;
     ffn_weights.fused_gating_intermediate.w4_weight_layout = w4_weight_layout;
+    ffn_weights.fused_gating_intermediate.w4_pad_size      = w4_pad_size;

     ffn_weights.output.input_dims       = inter_size_ / tensor_para_size_;
     ffn_weights.output.output_dims      = hidden_units_;
     ffn_weights.output.type             = weight_type;
     ffn_weights.output.group_size       = group_size;
     ffn_weights.output.w4_weight_layout = w4_weight_layout;
+    ffn_weights.output.w4_pad_size      = w4_pad_size;

     mallocWeights();
 }
@@ -118,16 +125,9 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
     else {  // int8, int4
         const int factor = sizeof(float) * 8 / bit_size;
         FT_CHECK(weights.input_dims % factor == 0);
-        // // read the environment variable
-        // int m_weightlayout_switch = 1;
-        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
-        // if (env_weightlayout_str != nullptr) {
-        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
-        // }
         if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2)) {
-            size_t new_input_dims = weights.input_dims + 2 * weights.group_size;
+            size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
             deviceMalloc((int**)&weights.kernel, new_input_dims * weights.output_dims / factor);
             deviceMemSetZero((int*)weights.kernel, new_input_dims * weights.output_dims / factor);
@@ -171,15 +171,10 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string& ...)
     }
     else {  // int8, int4
         const int factor = sizeof(float) * 8 / bit_size;
-        // // read the environment variable
-        // int m_weightlayout_switch = 1;
-        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
-        // if (env_weightlayout_str != nullptr) {
-        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
-        // }
         if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2)) {
-            size_t new_input_dims = weights.input_dims + weights.group_size;
+            size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
             output.insert(get_name("qweight"),
                           Tensor{MEMORY_GPU,
@@ -189,7 +184,7 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string& ...)
             output.insert(get_name("scales_zeros"),
                           Tensor{MEMORY_GPU,
                                  getTensorType<T>(),
-                                 {new_input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)},
+                                 {new_input_dims * weights.output_dims / weights.group_size * 2 * sizeof(T)},
                                  weights.scales_and_zeros});
         }
         else {
@@ -307,23 +302,36 @@ void loadWeights(LlamaDenseWeight<T>& w, ...)
         FT_CHECK(dim1 % factor == 0);
-        // // read the environment variable
-        // int m_weightlayout_switch = 1;
-        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
-        // if (env_weightlayout_str != nullptr) {
-        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
-        // }
-        if ((dim0 % 4096 == 0) && (w.w4_weight_layout == 1 || w.w4_weight_layout == 2)) {
-            size_t new_dim0 = dim0 + 2 * w.group_size;
-            std::vector<size_t> w_shape{new_dim0, dim1 / factor * sizeof(uint32_t)};
-            loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
-            const size_t group_count = w.group_size > 0 ? new_dim0 / w.group_size : 1;
-            loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+        if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2) {  // the transposed layout is needed
+            size_t new_dim0 = dim0;
+            if (dim0 % 4096 == 0)
+                new_dim0 = dim0 + w.w4_pad_size * w.group_size;
+            // allocate workspace
+            int*  kernel_workspace = nullptr;
+            half* scales_workspace = nullptr;
+            deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
+            deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
+            deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+            // load the weights
+            std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
+            loadWeightFromBin((int8_t*)kernel_workspace, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
+            const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
+            loadWeightFromBin((half*)scales_workspace, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+            // transpose
+            reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
+            reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
+            // free the workspace
+            cudaFree(kernel_workspace);
+            cudaFree(scales_workspace);
+            kernel_workspace = nullptr;
+            scales_workspace = nullptr;
         }
         else {
             std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
             loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
@@ -332,9 +340,57 @@ void loadWeights(LlamaDenseWeight<T>& w, ...)
             loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
         }
+        // the weight padding and transpose happen here
     }
 }

+template<typename T>
+void transWeights(LlamaDenseWeight<T>& w, FtCudaDataType model_file_type)
+{
+    const auto   type     = model_file_type;
+    size_t       dim0     = w.input_dims;
+    size_t       dim1     = w.output_dims;
+    const size_t bit_size = getBitSize(w.type);
+    const int    factor   = sizeof(float) * 8 / bit_size;
+    FT_CHECK(dim1 % factor == 0);
+    if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2) {  // the transposed layout is needed
+        size_t new_dim0 = dim0;
+        if (dim0 % 4096 == 0)
+            new_dim0 = dim0 + w.w4_pad_size * w.group_size;
+        // allocate workspace
+        int*  kernel_workspace = nullptr;
+        half* scales_workspace = nullptr;
+        deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
+        deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
+        deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+        deviceMemSetZero((half*)scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+        // copy the already-loaded weights into the workspace
+        cudaD2Dcpy((int*)kernel_workspace, (int*)w.kernel, dim0 * dim1 / factor);
+        cudaD2Dcpy((half*)scales_workspace, (half*)w.scales_and_zeros, dim0 / w.group_size * dim1 * 2);
+        // transpose
+        reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
+        reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
+        // free the workspace
+        cudaFree(kernel_workspace);
+        cudaFree(scales_workspace);
+        kernel_workspace = nullptr;
+        scales_workspace = nullptr;
+    }
+}
+
 template<typename T>
 void LlamaDecoderLayerWeight<T>::mallocWeights()
 {
@@ -420,6 +476,19 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType model_file_type)
     }
 }

+template<typename T>
+void LlamaDecoderLayerWeight<T>::modifyModel(FtCudaDataType model_file_type)
+{
+    const auto rank_spec = std::to_string(tensor_para_rank_);
+    const auto type      = model_file_type;
+    transWeights(self_attn_weights.qkv, type);
+    transWeights(self_attn_weights.output, type);
+    transWeights(ffn_weights.fused_gating_intermediate, type);
+    transWeights(ffn_weights.output, type);
+}
+
 template<typename T>
 TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)
 {
```
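The buffer sizing in the new load and transpose path follows one rule: when the K dimension (`input_dims`, called `dim0` above) is a multiple of 4096 and the transposed layout (`w4_weight_layout` 1 or 2) is in use, K is grown by `w4_pad_size * group_size` before allocating. A small worked example of that arithmetic follows; the function and values are illustrative, not code from the repository.

```python
# Worked example of the padded-dimension arithmetic used when allocating the
# int4 kernel and its scales/zeros buffers in the transposed (w4_weight_layout 1 or 2) path.
def padded_dims(dim0, dim1, group_size=128, w4_pad_size=2, bits=4):
    factor = 32 // bits                                    # int4 values packed per 32-bit word
    new_dim0 = dim0 + w4_pad_size * group_size if dim0 % 4096 == 0 else dim0
    kernel_words = new_dim0 * dim1 // factor               # packed qweight size in 32-bit words
    scale_zero_elems = new_dim0 // group_size * dim1 * 2   # interleaved scales and zeros (half elements)
    return new_dim0, kernel_words, scale_zero_elems

# e.g. a [4096, 4096] projection with the default w4_pad_size of 2:
print(padded_dims(4096, 4096))   # (4352, 2228224, 278528)
```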
src/turbomind/models/llama/LlamaDecoderLayerWeight.h

```diff
@@ -36,6 +36,7 @@ public:
                             WeightType weight_type,
                             int        group_size,
                             int        w4_weight_layout,
+                            int        w4_pad_size,
                             bool       attn_bias,
                             size_t     tensor_para_size,
                             size_t     tensor_para_rank);
@@ -44,6 +45,7 @@ public:
     LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

     void loadModel(std::string dir_path, FtCudaDataType model_file_type);
+    void modifyModel(FtCudaDataType model_file_type);

     TensorMap getParams(std::string prefix);
```
src/turbomind/models/llama/LlamaDenseWeight.h

```diff
@@ -64,6 +64,7 @@ struct LlamaDenseWeight {
     T*   scales_and_zeros;
     int  group_size;
     int  w4_weight_layout;
+    int  w4_pad_size;
 };

 template<typename T>
```
src/turbomind/models/llama/LlamaLinear.h

```diff
@@ -117,19 +117,19 @@ private:
         // check that the xpad workspace is large enough
         if (weight.input_dims % 4096 == 0) {  // padding is required
-            int pad_group_count = 2;
-            input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
-                          batch_size, weight.input_dims, weight.group_size, pad_group_count);
-            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                     (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
-                                     weight.input_dims + pad_group_count * weight.group_size,
-                                     weight.output_dims, weight.group_size);
+            input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
+                          batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
+                                     (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
+                                     weight.input_dims + weight.w4_pad_size * weight.group_size,
+                                     weight.output_dims, weight.group_size);
             cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                   CUBLAS_OP_N,
                                   weight.output_dims,  // m
                                   batch_size,          // n
-                                  weight.input_dims + pad_group_count * weight.group_size,  // k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  // k
                                   (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                  weight.input_dims + pad_group_count * weight.group_size,  // k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  // k
                                   (const T*)cublas_wrapper_->xpading_workspace_,
-                                  weight.input_dims + pad_group_count * weight.group_size,  // k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  // k
                                   output_data,
                                   weight.output_dims);  // m
         }
@@ -155,8 +155,7 @@ private:
         // check that the ck workspace is large enough
         if (weight.input_dims % 4096 == 0) {
-            int pad_groupcount = 2;
-            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel),
-                                 reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_data),
-                                 batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims),
-                                 (weight.input_dims + pad_groupcount * weight.group_size), weight.output_dims, weight.group_size,
-                                 reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel),
+                                 reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_data),
+                                 batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims),
+                                 (weight.input_dims + weight.w4_pad_size * weight.group_size), weight.output_dims, weight.group_size,
+                                 reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
         }
         //                  A  B0  B1  C  M  N  K  strideA  strideB  strideBpad  strideC  group_size
         else {
@@ -208,19 +207,19 @@ private:
         // check that the xpad workspace is large enough
         if (weight.input_dims % 4096 == 0) {  // padding is required
-            int pad_group_count = 2;
-            input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
-                             batch_size, weight.input_dims, weight.group_size, pad_group_count);
-            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                     (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
-                                     weight.input_dims + pad_group_count * weight.group_size,
-                                     weight.output_dims, weight.group_size);
+            input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data,
+                             batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
+                                     (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros,
+                                     weight.input_dims + weight.w4_pad_size * weight.group_size,
+                                     weight.output_dims, weight.group_size);
             cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                   CUBLAS_OP_N,
                                   weight.output_dims,  // m
                                   batch_size,          // n
-                                  weight.input_dims + pad_group_count * weight.group_size,  // k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  // k
                                   (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),
-                                  weight.input_dims + pad_group_count * weight.group_size,  // k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  // k
                                   (const T*)cublas_wrapper_->xpading_workspace_,
-                                  weight.input_dims + pad_group_count * weight.group_size,  // k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  // k
                                   output_tmp,
                                   weight.output_dims);  // m
         }
@@ -246,8 +245,8 @@ private:
         if (weight.input_dims % 4096 == 0) {
-            int pad_groupcount = 2;
-            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel),
-                                 reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_tmp),
-                                 batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims),
-                                 (weight.input_dims + pad_groupcount * weight.group_size), weight.output_dims, weight.group_size,
-                                 reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel),
+                                 reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_tmp),
+                                 batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims),
+                                 (weight.input_dims + weight.w4_pad_size * weight.group_size), weight.output_dims, weight.group_size,
+                                 reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
         }
         //                  A  B0  B1  C  M  N  K  strideA  strideB  strideBpad  strideC  group_size
         else {
```
src/turbomind/models/llama/LlamaWeight.cc

```diff
@@ -33,6 +33,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                             WeightType weight_type,
                             int        group_size,
                             int        w4_weight_layout,
+                            int        w4_pad_size,
                             size_t     tensor_para_size,
                             size_t     tensor_para_rank):
     hidden_units_(head_num * size_per_head),
@@ -57,6 +58,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                                                                    weight_type_,
                                                                    group_size,
                                                                    w4_weight_layout,
+                                                                   w4_pad_size,
                                                                    attn_bias,
                                                                    tensor_para_size_,
                                                                    tensor_para_rank_));
@@ -69,7 +71,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
         std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
         const char* env_value            = str_w4_weight_layout.c_str();
         setenv(env_name, env_value, 1);
-        printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n", w4_weight_layout);
+        // printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n", w4_weight_layout);
     }
     else {
@@ -128,8 +130,23 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
     for (unsigned layer = 0; layer < num_layer_; ++layer) {
         decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
     }
 }

+template<typename T>
+void LlamaWeight<T>::modifyModel()
+{
+    FtCudaDataType model_file_type = FtCudaDataType::FP16;
+    if (weight_type_ == WeightType::kBF16) {
+        model_file_type = FtCudaDataType::BF16;
+    }
+    for (unsigned layer = 0; layer < num_layer_; ++layer) {
+        decoder_layer_weights[layer]->modifyModel(model_file_type);
+    }
+}
+
 template<typename T>
 TensorMap LlamaWeight<T>::getParams()
 {
```
src/turbomind/models/llama/LlamaWeight.h

```diff
@@ -38,6 +38,7 @@ struct LlamaWeight {
                 WeightType weight_type,
                 int        group_size,
                 int        w4_weight_layout,
+                int        w4_pad_size,
                 size_t     tensor_para_size,
                 size_t     tensor_para_rank);
@@ -47,6 +48,7 @@ struct LlamaWeight {
     LlamaWeight& operator=(const LlamaWeight& other) = delete;

     void loadModel(std::string dir_path);
+    void modifyModel();

     TensorMap getParams();
```
src/turbomind/python/bind.cpp

```diff
@@ -439,6 +439,11 @@ PYBIND11_MODULE(_turbomind, m)
              py::call_guard<py::gil_scoped_release>(),
              "device_id"_a,
              "rank"_a)
+        .def("modify_shared_weights",
+             &AbstractTransformerModel::modifySharedWeights,
+             py::call_guard<py::gil_scoped_release>(),
+             "device_id"_a,
+             "rank"_a)
         .def("get_params",
              [](AbstractTransformerModel* model, int deviceId, int rank) {
```
src/turbomind/triton_backend/llama/LlamaTritonModel.cc

```diff
@@ -187,6 +187,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
     quant_policy_     = reader.GetInteger("llama", "quant_policy", 0);
     group_size_       = reader.GetInteger("llama", "group_size", 0);
     w4_weight_layout_ = reader.GetInteger("llama", "w4_weight_layout", 2);
+    w4_pad_size_      = reader.GetInteger("llama", "w4_pad_size", 2);

     // rotary embedding parameters
     attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
@@ -383,6 +384,7 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                        weight_type_,
                                                        group_size_,
                                                        w4_weight_layout_,
+                                                       w4_pad_size_,
                                                        tensor_para_size_,
                                                        tensor_para_rank);
     // model inited with model_dir
@@ -392,6 +394,21 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
     return;
 }

+template<typename T>
+void LlamaTritonModel<T>::modifySharedWeights(int device_id, int rank)
+{
+    ft::check_cuda_error(cudaSetDevice(device_id));
+    const int tensor_para_rank   = rank % tensor_para_size_;
+    const int pipeline_para_rank = rank / tensor_para_size_;
+    ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0);
+    if (weight_type_ == turbomind::WeightType::kINT4) {
+        shared_weights_[device_id]->modifyModel();
+    }
+    return;
+}
+
 template<typename T>
 TensorMap LlamaTritonModel<T>::getParams(int deviceId, int rank)
 {
```
src/turbomind/triton_backend/llama/LlamaTritonModel.h

```diff
@@ -53,6 +53,7 @@ struct LlamaTritonModel: public AbstractTransformerModel {
                         std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;

     void createSharedWeights(int deviceId, int rank) override;
+    void modifySharedWeights(int deviceId, int rank) override;

     TensorMap getParams(int deviceId, int rank) override;
@@ -102,6 +103,7 @@ private:
     int quant_policy_;
     int group_size_;
     int w4_weight_layout_;
+    int w4_pad_size_;

     // shared weights for each device
     std::vector<std::shared_ptr<ft::LlamaWeight<T>>> shared_weights_;
```
src/turbomind/triton_backend/transformer_triton_backend.hpp

```diff
@@ -325,6 +325,7 @@ struct AbstractTransformerModel {
                         std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) = 0;

     virtual void createSharedWeights(int deviceId, int rank) = 0;
+    virtual void modifySharedWeights(int deviceId, int rank) = 0;

     virtual TensorMap getParams(int deviceId, int rank) = 0;
```