OpenDAS / Lmdeploy · Commits

Commit 14ad512a, authored Jul 01, 2024 by gaoqiong

Add AWQ multi-GPU support
parent 6ba90df9

Showing 20 changed files with 1140 additions and 157 deletions (+1140 −157)
3rdparty/gpufusion/nccl.h (+805 −0)
README.md (+41 −16)
lmdeploy/cli/cli.py (+5 −0)
lmdeploy/turbomind/deploy/converter.py (+6 −1)
lmdeploy/turbomind/deploy/target_model/base.py (+3 −4)
lmdeploy/turbomind/deploy/target_model/w4.py (+12 −76)
lmdeploy/turbomind/turbomind.py (+34 −7)
lmdeploy/version.py (+1 −1)
src/turbomind/kernels/gemm_s_f16/format.cu (+64 −3)
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h (+5 −0)
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc (+101 −32)
src/turbomind/models/llama/LlamaDecoderLayerWeight.h (+2 −0)
src/turbomind/models/llama/LlamaDenseWeight.h (+1 −0)
src/turbomind/models/llama/LlamaLinear.h (+15 −16)
src/turbomind/models/llama/LlamaWeight.cc (+18 −1)
src/turbomind/models/llama/LlamaWeight.h (+2 −0)
src/turbomind/python/bind.cpp (+5 −0)
src/turbomind/triton_backend/llama/LlamaTritonModel.cc (+17 −0)
src/turbomind/triton_backend/llama/LlamaTritonModel.h (+2 −0)
src/turbomind/triton_backend/transformer_triton_backend.hpp (+1 −0)
3rdparty/gpufusion/nccl.h (new file, 0 → 100644)

Diff collapsed (805 added lines not shown).
README.md
@@ -67,7 +67,9 @@ yum install rapidjson
 export NCCL_LIB_DIR=/opt/dtk/cuda/lib64
 pip3 install -r requirements.txt
 pip3 install urllib3==1.24
-# switch the apt-get source and add the Tsinghua mirror
+apt-get install rapidjson-dev
+# if the install fails, switch the apt-get source and add the Tsinghua mirror
 # update after adding the Tsinghua mirror
 # vim /etc/apt/sources.list
 # add the Tsinghua mirror as follows:
@@ -75,17 +77,17 @@ pip3 install urllib3==1.24
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
 #deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
-# after switching the source, update
-sudo apt-get update
-apt-get install rapidjson-dev
+# after switching the source, update and then reinstall
+#sudo apt-get update
 # set the NCCL environment variable
 export NCCL_LAUNCH_MODE=GROUP
 ```
 Notes:
 1. The -v /opt/hyhal:/opt/hyhal mount must not be omitted when starting docker.
 2. The URL in the gpufusion wget command may change; if so, open the linked page and download the matching archive.
+3. If you build inside the DTK24041 pytorch image, the dtk in that image already ships gpufusion; during the build of this project one file has to be replaced: put lmdeploy/3rdparty/gpufusion/nccl.h into /opt/dtk/cuda/include.

 #### Build and install from source
 - Download the code
@@ -113,13 +115,16 @@ cd dist && pip3 install lmdeploy*
 ### Model conversion
 ```bash
-# <model_name> name of the model ('llama', 'internlm', 'vicuna', 'wizardlM', 'internlm-chat-7b', 'internlm-chat', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'baichuan-7b', 'baichuan2-7b', 'puyu', 'llama2', 'qwen-7b', 'qwen-14b', 'qwen-72b', 'codellama', 'solar', 'ultralm', 'ultracm', 'yi')
+# <model_name> name of the model (['base', 'llama', 'vicuna', 'wizardlm', 'internlm', …, 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'internlm2-1_8b', 'internlm2-7b', 'internlm2-20b', 'internlm2', 'internlm2-chat', …, 'internlm2-chat-20b', 'baichuan-base', 'baichuan-7b', 'baichuan2', 'baichuan2-7b', 'puyu', 'llama2', 'llama-2', 'llama-2-chat', 'qwen', 'qwen-7b', …, 'falcon', 'chatglm', 'chatglm2-6b', 'solar', 'solar-70b', 'ultralm', 'ultracm', 'yi', 'yi-chat', 'yi-200k', 'yi-34b', 'Mistral-7B-Instruct', …, 'mixtral', 'gemma', 'deepseek', 'deepseek-chat', 'yi-vl'])
 # <model_path> path to the model
-# <model_format> format of the model ('llama', 'hf', None; may be left unset — the default None lets the code pick the format from the model, which is the usual choice)
+# <model_format> format of the model ('awq', 'hf', 'llama')
-# <model_format> destination path for the output (default ./workspace)
+# <dst_path> destination path for the output (default ./workspace)
 # <tp> number of GPUs used for tensor parallelism; should be 2^n
+# <quant_model_path> the AWQ-quantized model
+# if using an fp16 model
 lmdeploy convert ${model_name} ${model_path} --model-format ${model_format} --dst-path ${dst_path} --tp ${tp}
+# if using an AWQ model
+lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 ```
 ### Run
 #### Run from the bash CLI
@@ -155,21 +160,41 @@ Detailed api-server usage can follow the docs [here](docs/zh_cn/serving)
 Deployment of the codellama model can follow [codellama](docs/zh_cn/supported_models/codellama.md)
-### AWQ quantized inference
+## AWQ quantized inference
 This version supports quantized inference; the steps are as follows:
 ```bash
-#group_size: the group parameter used when the model was quantized, usually 128
+# quantize with a calibration dataset
+# any dataset you need can be used for quantization; the example below uses c4
+# edit the get_c4() function in lmdeploy/lmdeploy/lite/utils/calib_dataloader.py to point at a local dataset path
+lmdeploy lite auto_awq ${model_path} --calib-dataset 'c4' --calib-samples 128 --calib-seqlen 2048 --w-bits 4 --w-group-size 128 --work-dir ${quant_model_path}
+#group_size: the group parameter used when the model was quantized; only 128 is supported
 #<tp> number of GPUs used for tensor parallelism; should be 2^n
 #<dst-path> destination folder for the saved model
 #step1: convert the model:
-lmdeploy convert ${model_name} ${model_path} --model_format awq --group-size ${group_size} --tp ${tp} --dst-path ${dst_path}
+lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
 #step1: run the model
 lmdeploy chat turbomind ${dst_path} --tp ${tp}
 ```
 Notes:
-1. This version currently supports only tp=1 single-GPU quantized inference, and only on KM-AI cards; K100/Z100/Z100L are not supported yet.
+1. This version only supports KM-AI cards; K100/Z100/Z100L are not supported yet.
-2. Quantized inference in this version requires converting the model to the turbomind format first and then running inference; quantized inference directly on hf models is not supported yet.
+2. For benchmark evaluation, AWQ models cannot be evaluated directly from the hf model; it is recommended to convert the quantized model to the turbomind format first, and the tp used at run time must match the tp specified at conversion time.
-3. This version does not yet support quantization with a calibration dataset; the quantized model has to be obtained elsewhere.
+3. For llama2-70b and qwen-72b it is recommended to set calib-samples to 120 when quantizing with a dataset.
+4. The list of models with multi-GPU support is as follows:
+
+| Model | AWQ TP=1 | AWQ TP=2 | AWQ TP=4 |
+| :----------: | :------: | :--: | :--: |
+| Llama2-7B-chat | Yes | Yes | No |
+| Llama2-13B-chat | Yes | Yes | Yes |
+| Llama2-70B-chat | Yes | Yes | Yes |
+| qwen-7B-chat | Yes | Yes | No |
+| qwen-14B-chat | Yes | No | No |
+| qwen-72B-chat | Yes | Yes | Yes |
+
+Remark: qwen-14b-chat does not support multi-GPU AWQ quantized inference because it contains a GEMM of size [13696, 5120]; with group_size 128 the scale shape is [13696/128, 5120] = [107, 5120], and 107 is not divisible by tp = 2 or 4. You can use this property to judge whether your own model supports multi-GPU AWQ inference.
 ## result
 ![w2](res/w4_a16_result1.png)
 ![w3](res/w4_a16_result2.png)
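The divisibility rule in the remark above can be checked mechanically. Below is a minimal Python sketch of the same criterion (not part of this commit; the layer shape and group size are the example values from the remark):

```python
# Sketch: check whether an AWQ-quantized [k, n] GEMM can be split across `tp` GPUs.
# The scales/zeros tensor has k // group_size rows; TurboMind shards those rows
# across tensor-parallel ranks, so the row count must divide evenly by tp.

def awq_tp_compatible(k: int, group_size: int, tp: int) -> bool:
    """True if the per-group scales of a [k, n] GEMM split evenly over tp ranks."""
    if k % group_size != 0:
        return False
    scale_rows = k // group_size
    return scale_rows % tp == 0

# qwen-14b-chat example from the remark: k = 13696, group_size = 128 -> 107 rows
for tp in (1, 2, 4):
    print(tp, awq_tp_compatible(13696, 128, tp))   # True, False, False
```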
lmdeploy/cli/cli.py

@@ -66,6 +66,11 @@ class CLI(object):
             type=int,
             default=2,
             help='A parameter used in AWQ to control the layout of weight ')
+        parser.add_argument(
+            '--w4-pad-size',
+            type=int,
+            default=2,
+            help='A parameter used in AWQ to control the pad size of weight ')
         parser.set_defaults(run=CLI.convert)

     @staticmethod
lmdeploy/turbomind/deploy/converter.py

@@ -197,6 +197,7 @@ def main(model_name: str,
          quant_path: str = None,
          group_size: int = 0,
          w4_weight_layout: int = 2,
+         w4_pad_size: int = 2,
          **kwargs):
    """deploy llama family models via turbomind.

@@ -217,6 +218,7 @@ def main(model_name: str,
        group_size (int): a parameter used in AWQ to quantize fp16 weights
            to 4 bits
        w4_weight_layout (int): a parameter used in AWQ to control the layout of weight
+       w4_pad_size(int): a parameter used in AWQ to control the layout of weight
        kwargs (dict): other params for convert
    """

@@ -263,12 +265,15 @@ def main(model_name: str,
     cfg.rotary_embedding = cfg.size_per_head
     cfg.group_size = group_size
     cfg.w4_weight_layout = w4_weight_layout
+    cfg.w4_pad_size = w4_pad_size
     if inferred_model_format.find('awq') != -1:
         cfg.weight_type = 'int4'
         output_format = 'w4'
         assert group_size > 0, f'group_size: {group_size} should > 0'
-        print("w4_weight_layout:", w4_weight_layout)
+        # print("w4_weight_layout:",w4_weight_layout)
         assert w4_weight_layout >= 0 and w4_weight_layout < 3, f'w4_weight_layout: {w4_weight_layout} should >= 0 and < 3'
+        assert w4_pad_size >= 0 and w4_pad_size < 5, f'w4_pad_size should >= 0 and <5'
     else:
         #output_format = update_output_format(model_name, inferred_model_format,
         #                                     model_path, output_format)
lmdeploy/turbomind/deploy/target_model/base.py

@@ -54,6 +54,7 @@ class TurbomindModelConfig:
     size_per_head: int = 128
     group_size: int = 0
     w4_weight_layout: int = 2
+    w4_pad_size: int = 2
     max_batch_size: int = 64
     max_context_token_num: int = 1
     step_length: int = 1

@@ -208,6 +209,7 @@ class BaseOutputModel(ABC):
                 param = param.to(torch_type)
                 tprint(name, param.shape)
                 _tofile(param, osp.join(self.out_dir, name))
         elif len(self.tm_params) > 0:
             tm_params = self.tm_params
             weight_type = self.cfg.weight_type

@@ -228,6 +230,7 @@ class BaseOutputModel(ABC):
                 torch_tensor = torch_tensor.float()
             for tm_tensor in tm_params[name]:
                 tm_tensor.copy_from(torch_tensor)
             tm_params.pop(name)
         else:
             tprint('skip export', name, param.shape)

@@ -325,10 +328,6 @@ def permute(x: torch.Tensor, size_per_head: int = 128):
     return x.view(n_heads, 2, dim // n_heads // 2,
                   1).transpose(1, 2).reshape(dim, 1)

-def permute_trans(x: torch.Tensor):
-    if x.shape[-1] > 1:
-        dim = x.shape[-1]
-        return x.view(-1, x.shape[-1]).transpose(0, 1).reshape(dim, -1)

 def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
               dim: int):
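For reference, the `permute_trans` helper removed above is simply a contiguous transpose of a 2-D scales tensor. A stand-alone sketch of the same reshape, on a toy tensor (illustration only, not the repository's code path):

```python
import torch

def permute_trans(x: torch.Tensor) -> torch.Tensor:
    # Same reshape as the helper removed from base.py above: for a 2-D input
    # this is just a transpose laid out contiguously.
    if x.shape[-1] > 1:
        dim = x.shape[-1]
        return x.view(-1, x.shape[-1]).transpose(0, 1).reshape(dim, -1)
    return x

x = torch.arange(12, dtype=torch.float16).reshape(3, 4)   # e.g. [k/group_size, n*2] scales
assert torch.equal(permute_trans(x), x.t().contiguous())
```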
lmdeploy/turbomind/deploy/target_model/w4.py

@@ -8,7 +8,7 @@ import lmdeploy
 from ..source_model.base import BaseInputModel, BaseReader
 from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
-                   merge_qkv, permute, permute_trans)
+                   merge_qkv, permute)

 # import _turbomind as _tm
 # TODO: find another way import _turbomind

@@ -117,6 +117,7 @@ class TurbomindW4Model(BaseOutputModel):
         group_size = self.cfg.group_size
         tp = self.cfg.tensor_para_size
         w4_weight_layout = self.cfg.w4_weight_layout
+        w4_pad_size = self.cfg.w4_pad_size
         size_per_head = self.cfg.size_per_head
         # attn
         q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))

@@ -134,48 +135,15 @@ class TurbomindW4Model(BaseOutputModel):
         qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
         qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
-        pad_group_count = 2
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if qkv_qw.shape[0] % 4096 == 0:
-                qkv_qw_padding = torch.zeros(group_size * pad_group_count, qkv_qw.shape[1], dtype=torch.int32).cuda()
-                qkv_qw = torch.cat((qkv_qw, qkv_qw_padding), dim=0).contiguous()
-                qkv_qz_padding = torch.zeros(pad_group_count, qkv_qz.shape[1], dtype=torch.int32).cuda()
-                qkv_qz = torch.cat((qkv_qz, qkv_qz_padding), dim=0).contiguous()
-                qkv_s_padding = torch.zeros(pad_group_count, qkv_s.shape[1], dtype=torch.float16).cuda()
-                qkv_s = torch.cat((qkv_s, qkv_s_padding), dim=0).contiguous()
-            qkv_qw, qkv_sz = convert_s4_(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-            qkv_sz = permute_trans(qkv_sz)
-        else:
-            qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
-            qkv_qw = tp_m_s4(qkv_qw, tp)
-            #print("请设置weight layout\n")
+        qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
+        qkv_qw = tp_m_s4(qkv_qw, tp)
         self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
         self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if o_qw.shape[0] % 4096 == 0:
-                o_qw_padding = torch.zeros(group_size * pad_group_count, o_qw.shape[1], dtype=torch.int32).cuda()
-                o_qw = torch.cat((o_qw, o_qw_padding), dim=0).contiguous()
-                o_qz_padding = torch.zeros(pad_group_count, o_qz.shape[1], dtype=torch.int32).cuda()
-                o_qz = torch.cat((o_qz, o_qz_padding), dim=0).contiguous()
-                o_s_padding = torch.zeros(pad_group_count, o_s.shape[1], dtype=torch.float16).cuda()
-                o_s = torch.cat((o_s, o_s_padding), dim=0).contiguous()
-            o_qw, o_sz = convert_s4_(o_qw, o_qz, o_s, group_size)
-            o_sz = permute_trans(o_sz)
-        else:
-            o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
+        o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
         self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
         self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)

         q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
         if q_b is not None:
             q_b = permute(q_b, size_per_head)

@@ -184,6 +152,7 @@ class TurbomindW4Model(BaseOutputModel):
             self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
             self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)

         # ffn weights
         w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
         w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))

@@ -191,45 +160,12 @@ class TurbomindW4Model(BaseOutputModel):
         w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
                                               w3_s)
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            if w13_qw.shape[0] % 4096 == 0:
-                w13_qw_padding = torch.zeros(group_size * pad_group_count, w13_qw.shape[1], dtype=torch.int32).cuda()
-                w13_qw = torch.cat((w13_qw, w13_qw_padding), dim=0).contiguous()
-                w13_qz_padding = torch.zeros(pad_group_count, w13_qz.shape[1], dtype=torch.int32).cuda()
-                w13_qz = torch.cat((w13_qz, w13_qz_padding), dim=0).contiguous()
-                w13_s_padding = torch.zeros(pad_group_count, w13_s.shape[1], dtype=torch.float16).cuda()
-                w13_s = torch.cat((w13_s, w13_s_padding), dim=0).contiguous()
-            w13_qw, w13_sz = convert_s4_(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
-            w13_sz = permute_trans(w13_sz)
-        else:
-            w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
-            w13_qw = tp_m_s4(w13_qw, tp)
+        w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
+        w13_qw = tp_m_s4(w13_qw, tp)
         self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
         self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',
                         -1)
-        if w4_weight_layout == 1 or w4_weight_layout == 2:
-            #pading
-            if w2_qw.shape[0] % 4096 == 0:
-                w2_qw_padding = torch.zeros(group_size * pad_group_count, w2_qw.shape[1], dtype=torch.int32).cuda()
-                w2_qw = torch.cat((w2_qw, w2_qw_padding), dim=0).contiguous()
-                w2_qz_padding = torch.zeros(pad_group_count, w2_qz.shape[1], dtype=torch.int32).cuda()
-                w2_qz = torch.cat((w2_qz, w2_qz_padding), dim=0).contiguous()
-                w2_s_padding = torch.zeros(pad_group_count, w2_s.shape[1], dtype=torch.float16).cuda()
-                w2_s = torch.cat((w2_s, w2_s_padding), dim=0).contiguous()
-            w2_qw, w2_sz = convert_s4_(w2_qw, w2_qz, w2_s, group_size)
-            w2_sz = permute_trans(w2_sz)
-        else:
-            w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
+        w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
         self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
         self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
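The padding scheme removed from w4.py above (and moved into the C++ weight loader) appends whole quantization groups of zeros along the input dimension whenever the row count is a multiple of 4096, keeping the packed qweight, qzeros and scales consistent with each other. A minimal stand-alone sketch of that idea, with made-up toy shapes (an illustration, not the repository's API):

```python
import torch

def pad_awq_groups(qweight: torch.Tensor, qzeros: torch.Tensor, scales: torch.Tensor,
                   group_size: int, pad_group_count: int):
    """Append pad_group_count zero groups along dim 0 of an AWQ weight triple.

    qweight: [k, n/8]            int32-packed 4-bit weights
    qzeros:  [k/group_size, n/8] int32-packed zero points
    scales:  [k/group_size, n]   fp16 scales
    """
    qw_pad = torch.zeros(group_size * pad_group_count, qweight.shape[1], dtype=torch.int32)
    qz_pad = torch.zeros(pad_group_count, qzeros.shape[1], dtype=torch.int32)
    s_pad = torch.zeros(pad_group_count, scales.shape[1], dtype=torch.float16)
    return (torch.cat((qweight, qw_pad), dim=0).contiguous(),
            torch.cat((qzeros, qz_pad), dim=0).contiguous(),
            torch.cat((scales, s_pad), dim=0).contiguous())

# toy shapes: k=4096, n=64, group_size=128, pad 2 groups -> k becomes 4352
qw, qz, s = pad_awq_groups(torch.zeros(4096, 8, dtype=torch.int32),
                           torch.zeros(32, 8, dtype=torch.int32),
                           torch.zeros(32, 64, dtype=torch.float16), 128, 2)
print(qw.shape, qz.shape, s.shape)   # (4352, 8), (34, 8), (34, 64)
```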
lmdeploy/turbomind/turbomind.py

@@ -148,6 +148,7 @@ class TurboMind:
                  model_format: Optional[str] = None,
                  group_size: Optional[int] = None,
                  w4_weight_layout: Optional[int] = None,
+                 w4_pad_size: Optional[int] = None,
                  tp: Optional[int] = None,
                  chat_template_config: Optional[ChatTemplateConfig] = None,
                  **kwargs):

@@ -181,6 +182,7 @@ class TurboMind:
                 model_format=model_format,
                 group_size=group_size,
                 w4_weight_layout=w4_weight_layout,
+                w4_pad_size=w4_pad_size,
                 tp=tp,
                 **kwargs)

@@ -237,6 +239,28 @@ class TurboMind:
             threads.append(t)
         for t in threads:
             t.join()

+    def _modify_weight(self, model_comm):
+        """modify weight if from_hf with awq."""
+        # TODO: support mpi
+        self.node_id = 0
+        self.node_num = 1
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
+        torch.cuda.synchronize()
+
+        def _modify_weight_func(device_id):
+            with cuda_ctx(device_id):
+                rank = self.node_id * self.gpu_count + device_id
+                model_comm.modify_shared_weights(device_id, rank)
+
+        threads = []
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_modify_weight_func, args=(device_id, ))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+
     def _load_kv_qparams(self, model_path, tm_params, **kwargs):
         """Load kv qparams when loading from hf."""

@@ -271,10 +295,10 @@ class TurboMind:
             t.join()
         for _ in range(self.gpu_count):
             tensor_map = que.get()
             for k, v in tensor_map.items():
                 if k not in tm_params:
                     tm_params[k] = []
                 tm_params[k].append(v)

     def _from_hf(self, model_source: ModelSource, model_path: str,

@@ -307,10 +331,11 @@ class TurboMind:
             data_type = 'int4'
             cfg.group_size = 128
             cfg.w4_weight_layout = 2
+            cfg.w4_pad_size = 0
         else:
-            # output_format = update_output_format(cfg.model_name,
-            #                                      inferred_model_format,
-            #                                      model_path, output_format)
+            output_format = update_output_format(cfg.model_name,
+                                                 inferred_model_format,
+                                                 model_path, output_format)
             data_type = output_format
             update_config_weight_type(output_format, cfg)

@@ -342,12 +367,15 @@ class TurboMind:
         # copy hf model weight to turbomind weight
         tm_params = output_model.tm_params
         self._get_model_params(model_comm, tm_params)
         logger.warning(f'get {len(tm_params)} model params')
         output_model.export()
+        self._modify_weight(model_comm)

         # load kv qparams
         self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
         assert len(tm_params) == 0, f'missing {tm_params.keys()}'

         return model_comm

@@ -381,7 +409,6 @@ class TurboMind:
         self.config = cfg
         self.model_name = cfg.model_name
         self.data_type = cfg.weight_type
-        #print("from_workspace_cfg:",cfg)

         # create model
         logger.warning(f'model_config:\n\n{cfg.toini()}')
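`_modify_weight` above follows the same per-device pattern used elsewhere in this file: spawn one thread per local GPU and have each thread call into the C++ model under its own device context. A generic sketch of that pattern (the worker body here is a placeholder, not the TurboMind API):

```python
from threading import Thread

def run_per_device(gpu_count: int, node_id: int, worker):
    """Run `worker(device_id, rank)` on one thread per local GPU and wait for all."""
    threads = []
    for device_id in range(gpu_count):
        rank = node_id * gpu_count + device_id
        t = Thread(target=worker, args=(device_id, rank))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()

# usage sketch: in _modify_weight above, the worker sets the CUDA device and then
# calls model_comm.modify_shared_weights(device_id, rank)
run_per_device(gpu_count=2, node_id=0,
               worker=lambda dev, rank: print(f'device {dev}, rank {rank}'))
```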
lmdeploy/version.py

 # Copyright (c) OpenMMLab. All rights reserved.
 from typing import Tuple

-__dcu_version__ = '0.2.6'
+__dcu_version__ = '0.2.6+das1.1.git7063377.abi0.dtk2404.torch2.1.0'
 __version__ = '0.2.6'

 short_version = __version__
src/turbomind/kernels/gemm_s_f16/format.cu

@@ -76,15 +76,75 @@ void reformat_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStre
     permute_u4<0, 1, 2, 3, 4, 5, 6, 7, 8, 9><<<512, 512, 0, st>>>(dst, src, shape);
 }

+template<typename T>
+void PrintData(cudaStream_t stream, const T* input, int size)
+{
+    int input_size = size;
+    T*  h_data;
+    h_data = new T[input_size];
+    cudaMemcpy(h_data, input, input_size * sizeof(T), cudaMemcpyDeviceToHost);
+    if constexpr (std::is_same<T, half>::value) {
+        for (int i = 0; i < input_size; i++) {
+            printf("%f ", __half2float(h_data[i]));
+        }
+    }
+    else if constexpr (std::is_same<T, half2>::value) {
+        for (int i = 0; i < input_size; i++) {
+            printf("x:%f y:%f ", __half2float(h_data[i].data[0]), __half2float(h_data[i].data[1]));
+        }
+    }
+    else if constexpr (std::is_same<T, uint32_t>::value) {
+        for (int i = 0; i < input_size; i++) {
+            printf(" %u ", h_data[i]);
+        }
+    }
+    printf("\n");
+    delete[] h_data;
+}
+
+#define INSTANTIATEPRINTDATA(T) \
+    template void PrintData(cudaStream_t stream, const T* input, int size);
+INSTANTIATEPRINTDATA(__half)
+INSTANTIATEPRINTDATA(float)
+INSTANTIATEPRINTDATA(half2)
+INSTANTIATEPRINTDATA(uint32_t)
+
 void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
 {
     // permutation for [k, m/8] layout
     Array<int, 10> shape{1, k / 8, 2, 2, 2, 1, m / 8, 2, 2, 2};
     // 0123456-->4,6,7,5,0,3,1,2
     //permute_u4<4, 6, 7, 5, 0, 3, 1, 2><<<512, 512, 0, st>>>(dst, src, shape);
     permute_u4<5, 6, 8, 9, 7, 0, 1, 4, 2, 3><<<512, 512, 0, st>>>(dst, src, shape);
 }

+__global__ void permute_u32(int num_kernels, uint32_t* dst, const uint32_t* src, int m, int k)
+{
+    //[k,m]-->[m,k]
+    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (id >= num_kernels)
+        return;
+    int j = id % k;
+    int i = id / k;
+    dst[id] = src[j * m + i];
+}
+
+void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
+{
+    // permutation for [k, m] layout
+    int num_kernels = k * m;
+    permute_u32<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, st>>>(num_kernels, dst, src, m, k);
+}
+
 __global__ void dequantize_s4_offset_64(uint4* dst, const uint32_t* src, size_t count)
 {
     for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) {

@@ -269,8 +329,7 @@ __global__ void input_padding_kernel(int num_kernels,T* output,const T* input,in
 template<typename T>
 void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount)
 {
-    // input has size [m, k]; output has size [m, n + group_size]
+    //
     int num_kernels = m * (k + pad_groupcount * group_size);
     input_padding_kernel<<<(num_kernels + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE, 0, stream>>>(num_kernels, output, input, m, k, group_size, pad_groupcount);
 }

@@ -282,3 +341,5 @@ template void input_padding(cudaStream_t stream, T* output,const T* input,int m,
 INSTANTIATEINPUTPADING(__half)

 }  // namespace turbomind
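The new `permute_u32` kernel above maps a [k, m] buffer to [m, k]: thread `id` writes `dst[id] = src[(id % k) * m + id // k]`, i.e. a plain transpose of the flattened array. A quick host-side check of that index math in Python (illustration only):

```python
import numpy as np

def permute_u32_host(src: np.ndarray, m: int, k: int) -> np.ndarray:
    """Host-side equivalent of the permute_u32 kernel: [k, m] -> [m, k]."""
    dst = np.empty(m * k, dtype=src.dtype)
    for idx in range(m * k):
        j, i = idx % k, idx // k          # same index split as the kernel
        dst[idx] = src[j * m + i]
    return dst.reshape(m, k)

src = np.arange(6, dtype=np.uint32)        # a [k=3, m=2] buffer, row-major
assert np.array_equal(permute_u32_host(src, m=2, k=3), src.reshape(3, 2).T)
```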
src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h

@@ -36,6 +36,11 @@ void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output,const uint32_t*
 template<typename T>
 void input_padding(cudaStream_t stream, T* output, const T* input, int m, int k, int group_size, int pad_groupcount);

+void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);
+void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);
+
+template<typename T>
+void PrintData(cudaStream_t stream, const T* input, int size);

 class GemmS4F16 {
 public:
     GemmS4F16();
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc

@@ -17,7 +17,7 @@
 // Modified from
 // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
+#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
 #include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
 #include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/utils/logger.h"

@@ -42,6 +42,7 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
                                                     WeightType weight_type,
                                                     int        group_size,
                                                     int        w4_weight_layout,
+                                                    int        w4_pad_size,
                                                     bool       attn_bias,
                                                     size_t     tensor_para_size,
                                                     size_t     tensor_para_rank):

@@ -60,36 +61,42 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
     self_attn_weights.qkv.type             = weight_type;
     self_attn_weights.qkv.group_size       = group_size;
     self_attn_weights.qkv.w4_weight_layout = w4_weight_layout;
+    self_attn_weights.qkv.w4_pad_size      = w4_pad_size;

     self_attn_weights.output.input_dims       = hidden_units_ / tensor_para_size_;
     self_attn_weights.output.output_dims      = hidden_units_;
     self_attn_weights.output.type             = weight_type;
     self_attn_weights.output.group_size       = group_size;
     self_attn_weights.output.w4_weight_layout = w4_weight_layout;
+    self_attn_weights.output.w4_pad_size      = w4_pad_size;

     ffn_weights.gating.input_dims       = hidden_units_;
     ffn_weights.gating.output_dims      = inter_size_ / tensor_para_size_;
     ffn_weights.gating.type             = weight_type;
     ffn_weights.gating.group_size       = group_size;
     ffn_weights.gating.w4_weight_layout = w4_weight_layout;
+    ffn_weights.gating.w4_pad_size      = w4_pad_size;

     ffn_weights.intermediate.input_dims       = hidden_units_;
     ffn_weights.intermediate.output_dims      = inter_size_ / tensor_para_size_;
     ffn_weights.intermediate.type             = weight_type;
     ffn_weights.intermediate.group_size       = group_size;
     ffn_weights.intermediate.w4_weight_layout = w4_weight_layout;
+    ffn_weights.intermediate.w4_pad_size      = w4_pad_size;

     ffn_weights.fused_gating_intermediate.input_dims       = hidden_units_;
     ffn_weights.fused_gating_intermediate.output_dims      = inter_size_ / tensor_para_size_ * 2;
     ffn_weights.fused_gating_intermediate.type             = weight_type;
     ffn_weights.fused_gating_intermediate.group_size       = group_size;
     ffn_weights.fused_gating_intermediate.w4_weight_layout = w4_weight_layout;
+    ffn_weights.fused_gating_intermediate.w4_pad_size      = w4_pad_size;

     ffn_weights.output.input_dims       = inter_size_ / tensor_para_size_;
     ffn_weights.output.output_dims      = hidden_units_;
     ffn_weights.output.type             = weight_type;
     ffn_weights.output.group_size       = group_size;
     ffn_weights.output.w4_weight_layout = w4_weight_layout;
+    ffn_weights.output.w4_pad_size      = w4_pad_size;

     mallocWeights();
 }

@@ -118,16 +125,9 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
     else {  // int8, int4
         const int factor = sizeof(float) * 8 / bit_size;
         FT_CHECK(weights.input_dims % factor == 0);
-        // // read the environment variable
-        // int m_weightlayout_switch=1;
-        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
-        // if (env_weightlayout_str != nullptr) {
-        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
-        // }
         if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2))
         {
-            size_t new_input_dims = weights.input_dims + 2 * weights.group_size;
+            size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
             deviceMalloc((int**)&weights.kernel, new_input_dims * weights.output_dims / factor);
             deviceMemSetZero((int*)weights.kernel, new_input_dims * weights.output_dims / factor);

@@ -171,15 +171,10 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
     }
     else {  // int8, int4
         const int factor = sizeof(float) * 8 / bit_size;
-        // // read the environment variable
-        // int m_weightlayout_switch=1;
-        // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
-        // if (env_weightlayout_str != nullptr) {
-        //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
-        // }
         if ((weights.input_dims % 4096 == 0) && (weights.w4_weight_layout == 1 || weights.w4_weight_layout == 2))
         {
-            size_t new_input_dims = weights.input_dims + weights.group_size;
+            size_t new_input_dims = weights.input_dims + weights.w4_pad_size * weights.group_size;
             output.insert(get_name("qweight"),
                           Tensor{MEMORY_GPU,

@@ -189,7 +184,7 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
             output.insert(get_name("scales_zeros"),
                           Tensor{MEMORY_GPU,
                                  getTensorType<T>(),
-                                 {new_input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)},
+                                 {new_input_dims * weights.output_dims / weights.group_size * 2 * sizeof(T)},
                                  weights.scales_and_zeros});
         }
         else {

@@ -307,23 +302,36 @@ void loadWeights(LlamaDenseWeight<T>& w,
     FT_CHECK(dim1 % factor == 0);
-    // // read the environment variable
-    // int m_weightlayout_switch=1;
-    // const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
-    // if (env_weightlayout_str != nullptr) {
-    //     m_weightlayout_switch = std::stoi(env_weightlayout_str);
-    // }
-    if ((dim0 % 4096 == 0) && (w.w4_weight_layout == 1 || w.w4_weight_layout == 2))
+    if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2)  // needs transposing
     {
-        size_t new_dim0 = dim0 + 2 * w.group_size;
-        std::vector<size_t> w_shape{new_dim0, dim1 / factor * sizeof(uint32_t)};
-        loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
-        const size_t group_count = w.group_size > 0 ? new_dim0 / w.group_size : 1;
-        loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+        size_t new_dim0 = dim0;
+        if (dim0 % 4096 == 0)
+            new_dim0 = dim0 + w.w4_pad_size * w.group_size;
+        // allocate memory
+        int*  kernel_workspace = nullptr;
+        half* scales_workspace = nullptr;
+        deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
+        deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
+        deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+        // load the weight
+        std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
+        loadWeightFromBin((int8_t*)kernel_workspace, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
+        const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
+        loadWeightFromBin((half*)scales_workspace, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
+        // transpose
+        reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
+        reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
+        // free the memory
+        cudaFree(kernel_workspace);
+        cudaFree(scales_workspace);
+        kernel_workspace = nullptr;
+        scales_workspace = nullptr;
     }
     else {
         std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
         loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});

@@ -332,9 +340,57 @@ void loadWeights(LlamaDenseWeight<T>& w,
         loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
     }
-    // pad and transpose the weight here
 }

+template<typename T>
+void transWeights(LlamaDenseWeight<T>& w, FtCudaDataType model_file_type)
+{
+    const auto   type     = model_file_type;
+    size_t       dim0     = w.input_dims;
+    size_t       dim1     = w.output_dims;
+    const size_t bit_size = getBitSize(w.type);
+    const int    factor   = sizeof(float) * 8 / bit_size;
+    FT_CHECK(dim1 % factor == 0);
+    if (w.w4_weight_layout == 1 || w.w4_weight_layout == 2)  // needs transposing
+    {
+        size_t new_dim0 = dim0;
+        if (dim0 % 4096 == 0)
+            new_dim0 = dim0 + w.w4_pad_size * w.group_size;
+        // allocate memory
+        int*  kernel_workspace = nullptr;
+        half* scales_workspace = nullptr;
+        deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
+        deviceMemSetZero((int*)kernel_workspace, new_dim0 * dim1 / factor);
+        deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+        deviceMemSetZero((half*)scales_workspace, new_dim0 / w.group_size * dim1 * 2);
+        // copy the loaded weight
+        cudaD2Dcpy((int*)kernel_workspace, (int*)w.kernel, dim0 * dim1 / factor);
+        cudaD2Dcpy((half*)scales_workspace, (half*)w.scales_and_zeros, dim0 / w.group_size * dim1 * 2);
+        // transpose
+        reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel, (uint32_t*)kernel_workspace, dim1, new_dim0, 0);
+        reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros, (uint32_t*)scales_workspace, dim1, new_dim0 / w.group_size, 0);
+        // free the memory
+        cudaFree(kernel_workspace);
+        cudaFree(scales_workspace);
+        kernel_workspace = nullptr;
+        scales_workspace = nullptr;
+    }
+}
+
 template<typename T>
 void LlamaDecoderLayerWeight<T>::mallocWeights()
 {

@@ -420,6 +476,19 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
     }
 }

+template<typename T>
+void LlamaDecoderLayerWeight<T>::modifyModel(FtCudaDataType model_file_type)
+{
+    const auto rank_spec = std::to_string(tensor_para_rank_);
+    const auto type      = model_file_type;
+    transWeights(self_attn_weights.qkv, type);
+    transWeights(self_attn_weights.output, type);
+    transWeights(ffn_weights.fused_gating_intermediate, type);
+    transWeights(ffn_weights.output, type);
+}
+
 template<typename T>
 TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)
 {
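The padded allocation sizes used in `mallocWeights`/`getWeightTensor` above follow directly from the shapes: with `factor = 32 / 4 = 8` nibbles packed per int32, a padded input dimension of `input_dims + w4_pad_size * group_size` gives `new_input_dims * output_dims / factor` packed ints for qweight and `new_input_dims / group_size * output_dims * 2` half values for scales_zeros. A small Python sketch of that arithmetic (illustrative numbers only):

```python
def awq_buffer_sizes(input_dims: int, output_dims: int, group_size: int,
                     w4_pad_size: int, bit_size: int = 4):
    """Element counts for the padded qweight (int32) and scales_zeros (fp16) buffers."""
    factor = 32 // bit_size                      # 4-bit values packed per int32
    new_input_dims = input_dims
    if input_dims % 4096 == 0:                   # same condition as the C++ code above
        new_input_dims = input_dims + w4_pad_size * group_size
    qweight_ints = new_input_dims * output_dims // factor
    scales_zeros_halfs = new_input_dims // group_size * output_dims * 2
    return qweight_ints, scales_zeros_halfs

# e.g. a 4096 x 4096 projection with group_size=128 and w4_pad_size=2
print(awq_buffer_sizes(4096, 4096, 128, 2))      # (2228224, 278528)
```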
src/turbomind/models/llama/LlamaDecoderLayerWeight.h

@@ -36,6 +36,7 @@ public:
                             WeightType weight_type,
                             int        group_size,
                             int        w4_weight_layout,
+                            int        w4_pad_size,
                             bool       attn_bias,
                             size_t     tensor_para_size,
                             size_t     tensor_para_rank);

@@ -44,6 +45,7 @@ public:
     LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

     void loadModel(std::string dir_path, FtCudaDataType model_file_type);
+    void modifyModel(FtCudaDataType model_file_type);
     TensorMap getParams(std::string prefix);
src/turbomind/models/llama/LlamaDenseWeight.h

@@ -64,6 +64,7 @@ struct LlamaDenseWeight {
     T*  scales_and_zeros;
     int group_size;
     int w4_weight_layout;
+    int w4_pad_size;
 };

 template<typename T>
src/turbomind/models/llama/LlamaLinear.h

@@ -117,19 +117,19 @@ private:
         // check whether the xpad workspace is large enough
         if (weight.input_dims % 4096 == 0)  // padding needed
         {
-            int pad_group_count = 2;
-            input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data, batch_size, weight.input_dims, weight.group_size, pad_group_count);
-            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_), (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros, weight.input_dims + pad_group_count * weight.group_size, weight.output_dims, weight.group_size);
+            input_padding(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data, batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_), (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros, weight.input_dims + weight.w4_pad_size * weight.group_size, weight.output_dims, weight.group_size);
             cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                   CUBLAS_OP_N,
                                   weight.output_dims,  //m
                                   batch_size,          //n
-                                  weight.input_dims + pad_group_count * weight.group_size,  //k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  //k
                                   (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),  //[]
-                                  weight.input_dims + pad_group_count * weight.group_size,  //k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  //k
                                   (const T*)cublas_wrapper_->xpading_workspace_,
-                                  weight.input_dims + pad_group_count * weight.group_size,  //k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  //k
                                   output_data,
                                   weight.output_dims);  //m
         }

@@ -155,8 +155,7 @@ private:
         // check whether the ck workspace is large enough
         if (weight.input_dims % 4096 == 0)
         {
-            int pad_groupcount = 2;
-            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_data), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims), (weight.input_dims + pad_groupcount * weight.group_size), weight.output_dims, weight.group_size, reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_data), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims), (weight.input_dims + weight.w4_pad_size * weight.group_size), weight.output_dims, weight.group_size, reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
         }
         // A B0 B1 C M N K strideA strideB strideBpad strideC group_size
         else {

@@ -208,19 +207,19 @@ private:
         // check whether the xpad workspace is large enough
         if (weight.input_dims % 4096 == 0)  // padding needed
         {
-            int pad_group_count = 2;
-            input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data, batch_size, weight.input_dims, weight.group_size, pad_group_count);
-            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_), (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros, weight.input_dims + pad_group_count * weight.group_size, weight.output_dims, weight.group_size);
+            input_padding<T>(stream_, reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_), (const T*)input_data, batch_size, weight.input_dims, weight.group_size, weight.w4_pad_size);
+            dequant_w4_gemm_colmajor(stream_, reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_), (const uint32_t*)weight.kernel, (const half2*)weight.scales_and_zeros, weight.input_dims + weight.w4_pad_size * weight.group_size, weight.output_dims, weight.group_size);
             cublas_wrapper_->Gemm(CUBLAS_OP_T,
                                   CUBLAS_OP_N,
                                   weight.output_dims,  //m
                                   batch_size,          //n
-                                  weight.input_dims + pad_group_count * weight.group_size,  //k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  //k
                                   (const T*)reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),  //[]
-                                  weight.input_dims + pad_group_count * weight.group_size,  //k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  //k
                                   (const T*)cublas_wrapper_->xpading_workspace_,
-                                  weight.input_dims + pad_group_count * weight.group_size,  //k
+                                  weight.input_dims + weight.w4_pad_size * weight.group_size,  //k
                                   output_tmp,
                                   weight.output_dims);  //m
         }

@@ -246,8 +245,8 @@ private:
         if (weight.input_dims % 4096 == 0)
         {
-            int pad_groupcount = 2;
-            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_tmp), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims), (weight.input_dims + pad_groupcount * weight.group_size), weight.output_dims, weight.group_size, reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
+            run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*>(output_tmp), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims), (weight.input_dims), (weight.input_dims + weight.w4_pad_size * weight.group_size), weight.output_dims, weight.group_size, reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_), CK_WORKSPACE_SIZE, (hipStream_t)stream_);
         }
         // A B0 B1 C M N K strideA strideB strideBpad strideC group_size
         else {
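A note on why the padded K dimension in the GEMM calls above is harmless: the extra `w4_pad_size * group_size` rows of the dequantized weight are zero, and the activation is zero-padded to the same length by `input_padding`, so every dot product is unchanged. A small Python sketch of that argument (an illustration, not the runtime code):

```python
import numpy as np

rng = np.random.default_rng(0)
m, k, n, pad = 4, 8, 5, 3                 # pad stands in for w4_pad_size * group_size

x = rng.standard_normal((m, k))           # activations
w = rng.standard_normal((k, n))           # dequantized weight

# zero-pad both operands along K, as input_padding / the padded weight layout do
x_pad = np.concatenate([x, np.zeros((m, pad))], axis=1)
w_pad = np.concatenate([w, np.zeros((pad, n))], axis=0)

assert np.allclose(x @ w, x_pad @ w_pad)  # padded GEMM gives the same result
```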
src/turbomind/models/llama/LlamaWeight.cc

@@ -33,6 +33,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                             WeightType weight_type,
                             int        group_size,
                             int        w4_weight_layout,
+                            int        w4_pad_size,
                             size_t     tensor_para_size,
                             size_t     tensor_para_rank):
     hidden_units_(head_num * size_per_head),

@@ -57,6 +58,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
                                         weight_type_,
                                         group_size,
                                         w4_weight_layout,
+                                        w4_pad_size,
                                         attn_bias,
                                         tensor_para_size_,
                                         tensor_para_rank_));

@@ -69,7 +71,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
         std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
         const char* env_value = str_w4_weight_layout.c_str();
         setenv(env_name, env_value, 1);
+        //printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",w4_weight_layout);
     }
     else
     {

@@ -128,8 +130,23 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
     for (unsigned layer = 0; layer < num_layer_; ++layer) {
         decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
     }
 }

+template<typename T>
+void LlamaWeight<T>::modifyModel()
+{
+    FtCudaDataType model_file_type = FtCudaDataType::FP16;
+    if (weight_type_ == WeightType::kBF16) {
+        model_file_type = FtCudaDataType::BF16;
+    }
+    for (unsigned layer = 0; layer < num_layer_; ++layer) {
+        decoder_layer_weights[layer]->modifyModel(model_file_type);
+    }
+}
+
 template<typename T>
 TensorMap LlamaWeight<T>::getParams()
 {
src/turbomind/models/llama/LlamaWeight.h

@@ -38,6 +38,7 @@ struct LlamaWeight {
                 WeightType weight_type,
                 int        group_size,
                 int        w4_weight_layout,
+                int        w4_pad_size,
                 size_t     tensor_para_size,
                 size_t     tensor_para_rank);

@@ -47,6 +48,7 @@ struct LlamaWeight {
     LlamaWeight& operator=(const LlamaWeight& other) = delete;

     void loadModel(std::string dir_path);
+    void modifyModel();
     TensorMap getParams();
src/turbomind/python/bind.cpp

@@ -439,6 +439,11 @@ PYBIND11_MODULE(_turbomind, m)
              py::call_guard<py::gil_scoped_release>(),
              "device_id"_a,
              "rank"_a)
+        .def("modify_shared_weights",
+             &AbstractTransformerModel::modifySharedWeights,
+             py::call_guard<py::gil_scoped_release>(),
+             "device_id"_a,
+             "rank"_a)
         .def(
             "get_params",
             [](AbstractTransformerModel* model, int deviceId, int rank) {
src/turbomind/triton_backend/llama/LlamaTritonModel.cc

@@ -187,6 +187,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
     quant_policy_     = reader.GetInteger("llama", "quant_policy", 0);
     group_size_       = reader.GetInteger("llama", "group_size", 0);
     w4_weight_layout_ = reader.GetInteger("llama", "w4_weight_layout", 2);
+    w4_pad_size_      = reader.GetInteger("llama", "w4_pad_size", 2);

     // rotary embedding parameters
     attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding");

@@ -383,6 +384,7 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
                                                   weight_type_,
                                                   group_size_,
                                                   w4_weight_layout_,
+                                                  w4_pad_size_,
                                                   tensor_para_size_,
                                                   tensor_para_rank);
     // model inited with model_dir

@@ -392,6 +394,21 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
     return;
 }

+template<typename T>
+void LlamaTritonModel<T>::modifySharedWeights(int device_id, int rank)
+{
+    ft::check_cuda_error(cudaSetDevice(device_id));
+    const int tensor_para_rank   = rank % tensor_para_size_;
+    const int pipeline_para_rank = rank / tensor_para_size_;
+    ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0);
+    if (weight_type_ == turbomind::WeightType::kINT4) {
+        shared_weights_[device_id]->modifyModel();
+    }
+    return;
+}
+
 template<typename T>
 TensorMap LlamaTritonModel<T>::getParams(int deviceId, int rank)
 {
src/turbomind/triton_backend/llama/LlamaTritonModel.h

@@ -53,6 +53,7 @@ struct LlamaTritonModel: public AbstractTransformerModel {
                       std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;

     void createSharedWeights(int deviceId, int rank) override;
+    void modifySharedWeights(int deviceId, int rank) override;

     TensorMap getParams(int deviceId, int rank) override;

@@ -102,6 +103,7 @@ private:
     int quant_policy_;
     int group_size_;
     int w4_weight_layout_;
+    int w4_pad_size_;

     // shared weights for each device
     std::vector<std::shared_ptr<ft::LlamaWeight<T>>> shared_weights_;
src/turbomind/triton_backend/transformer_triton_backend.hpp

@@ -325,6 +325,7 @@ struct AbstractTransformerModel {
                       std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) = 0;

     virtual void createSharedWeights(int deviceId, int rank) = 0;
+    virtual void modifySharedWeights(int deviceId, int rank) = 0;

     virtual TensorMap getParams(int deviceId, int rank) = 0;