Commit 14ad512a authored by gaoqiong

Add AWQ multi-GPU support

parent 6ba90df9
......@@ -67,7 +67,9 @@ yum install rapidjson
export NCCL_LIB_DIR=/opt/dtk/cuda/lib64
pip3 install -r requirements.txt
pip3 install urllib3==1.24
# apt-get: switch sources and add the Tsinghua mirror
apt-get install rapidjson-dev
# If the install fails, switch the apt-get sources and add the Tsinghua mirror
# After adding the Tsinghua mirror, run an update
#vim /etc/apt/sources.list
# Add the Tsinghua mirror entries as follows:
......@@ -75,17 +77,17 @@ pip3 install urllib3==1.24
#deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
#deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
#deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
# After switching sources, run an update
sudo apt-get update
apt-get install rapidjson-dev
# After switching sources, run an update and then reinstall
#sudo apt-get update
# Set the NCCL environment variable
export NCCL_LAUNCH_MODE=GROUP
```
Note:
1. When starting docker, the -v /opt/hyhal:/opt/hyhal mount must not be omitted.
2. The URL given in the gpufusion wget command may change; you can open the referenced page and download the corresponding archive manually.
3. If you build inside the DTK24041 pytorch image, the dtk in that image already ships the gpufusion files; during the build of this project one of them has to be replaced: put lmdeploy/3rdparty/gpufusion/nccl.h into /opt/dtk/cuda/include, as shown below.
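For example, a minimal sketch of that replacement, assuming the default paths from note 3:
```bash
cp lmdeploy/3rdparty/gpufusion/nccl.h /opt/dtk/cuda/include/
```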
#### Build and install from source
- Download the code
......@@ -113,13 +115,16 @@ cd dist && pip3 install lmdeploy*
### Model conversion
```bash
# <model_name> name of the model ('llama', 'internlm', 'vicuna', 'wizardlM', 'internlm-chat-7b', 'internlm-chat', 'internlm-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'baichuan-7b', 'baichuan2-7b', 'puyu', 'llama2', 'qwen-7b', 'qwen-14b', 'qwen-72b', 'codellama', 'solar', 'ultralm', 'ultracm', 'yi')
# <model_name> name of the model (['base', 'llama', 'vicuna', 'wizardlm', 'internlm', 'internlm-cha-chat-7b-8k', 'internlm-chat-20b', 'internlm-20b', 'internlm2-1_8b', 'internlm2-7b', 'internlm2-20b', 'internlm2', 'internlm2-chat', 'internlm2-cinternlm2-chat-20b', 'baichuan-base', 'baichuan-7b', 'baichuan2', 'baichuan2-7b', 'puyu', 'llama2', 'llama-2', 'llama-2-chat', 'qwen', 'qwen-7b',ma', 'falcon', 'chatglm', 'chatglm2-6b', 'solar', 'solar-70b', 'ultralm', 'ultracm', 'yi', 'yi-chat', 'yi-200k', 'yi-34b', 'Mistral-7B-Instruct',l', 'mixtral', 'gemma', 'deepseek', 'deepseek-chat', 'yi-vl'])
# <model_path> path to the model
# <model_format> format of the model ('llama', 'hf', or None); can be omitted, defaults to None and the code infers the format from the model, so it is usually left unset
# <model_format> target path for the saved output (default ./workspace)
# <model_format> format of the model ('awq', 'hf', 'llama')
# <dst_path> target path for the saved output (default ./workspace)
# <tp> number of GPUs used for tensor parallelism; should be 2^n
# <quant_model_path> path to the AWQ quantized model
# For an fp16 model
lmdeploy convert ${model_name} ${model_path} --model-format ${model_format} --dst-path ${dst_path} --tp ${tp}
# For an AWQ model
lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
```
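For example, a hypothetical fp16 conversion of a local Llama2 chat checkpoint for two GPUs (the model path and output directory below are placeholders):
```bash
lmdeploy convert llama2 /path/to/llama2-7b-chat --dst-path ./workspace --tp 2
```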
### Run
#### Run from the bash console
......@@ -155,21 +160,41 @@ For detailed usage of the api-server, refer to the documentation [here](docs/zh_cn/serving)
Deployment of the codellama model is described in [codellama](docs/zh_cn/supported_models/codellama.md)
### AWQ quantized inference
## AWQ quantized inference
This version supports quantized inference; the steps are as follows:
```bash
# group_size: the grouping parameter used when the model was quantized, usually 128
# Quantization with a calibration dataset
# Any dataset can be used for quantization as needed; the example below uses c4
# Modify the get_c4() function in lmdeploy/lmdeploy/lite/utils/calib_dataloader.py to point to a local dataset path
lmdeploy lite auto_awq ${model_path} --calib-dataset 'c4' --calib-samples 128 --calib-seqlen 2048 --w-bits 4 --w-group-size 128 --work-dir ${quant_model_path}
# group_size: the grouping parameter used when the model was quantized; only 128 is supported
# <tp> number of GPUs used for tensor parallelism; should be 2^n
# <dst-path> target folder for the saved model
# step 1: model conversion
lmdeploy convert ${model_name} ${model_path} --model_format awq --group-size ${group_size} --tp ${tp} --dst-path ${dst_path}
lmdeploy convert ${model_name} ${quant_model_path} --model-format awq --group-size 128 --tp ${tp} --dst-path ${dst_path}
# step 2: run the model
lmdeploy chat turbomind ${dst_path} --tp ${tp}
```
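Putting the two steps together, a sketch of an end-to-end run for a 4-bit AWQ Llama2-7B-chat checkpoint on two GPUs (the paths are placeholders):
```bash
# step 1: convert the AWQ checkpoint to the turbomind format for tp=2
lmdeploy convert llama2 /path/to/llama2-7b-chat-awq --model-format awq --group-size 128 --tp 2 --dst-path ./workspace_awq
# step 2: run it with the same tp
lmdeploy chat turbomind ./workspace_awq --tp 2
```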
Notes:
1. This version temporarily supports only tp=1 single-GPU quantized inference; only the KM-AI card is supported, not yet K100/Z100/Z100L;
2. Quantized inference in this version requires first converting the model to the turbomind format with convert and then running inference; quantized inference directly on an hf model is not yet supported;
3. This version does not yet support quantization with a calibration dataset; the quantized model has to be obtained elsewhere;
1. This version only supports the KM-AI card; K100/Z100/Z100L are not yet supported;
2. For benchmark evaluation, AWQ models cannot be evaluated directly from the hf model; first convert the quantized model to the turbomind format with the tool, and the tp used at run time must match the tp specified at conversion;
3. When quantizing the llama2-70b and qwen-72b models with a calibration dataset, setting calib-samples to 120 is recommended;
4. The models supported on multiple GPUs are listed below:
| Model | AWQ TP=1 | AWQ TP=2 | AWQ TP=4 |
| :----------: | :------: | :--: | :--: |
| Llama2-7B-chat | Yes | Yes | No |
| Llama2-13B-chat | Yes | Yes | Yes |
| Llama2-70B-chat | Yes | Yes | Yes |
| qwen-7B-chat | Yes | Yes | No |
| qwen-14B-chat | Yes | No | No |
| qwen-72B-chat | Yes | Yes | Yes |
Note: the qwen-14B-chat model does not support multi-GPU AWQ quantized inference because it contains a GEMM of size [13696, 5120]; with group_size 128 the scale shape is [13696/128, 5120] = [107, 5120], and 107 is not divisible by tp=2 or tp=4. You can use this rule to judge whether your own model supports multi-GPU AWQ inference.
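The same divisibility rule can be checked quickly, for example (a standalone sketch, not part of lmdeploy):
```bash
# K is the GEMM reduction dimension; 13696 is the problematic qwen-14B case.
K=13696; GROUP=128; TP=2
if [ $(( (K / GROUP) % TP )) -eq 0 ]; then echo "AWQ TP=$TP possible"; else echo "AWQ TP=$TP not supported"; fi
```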
## Result
![qwen inference](docs/dcu/interlm.gif)
......
......@@ -66,6 +66,11 @@ class CLI(object):
type=int,
default=2,
help='A parameter used in AWQ to control the layout of weight ')
parser.add_argument(
'--w4-pad-size',
type=int,
default=2,
help='A parameter used in AWQ to control the pad size of weight ')
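# Example (hypothetical paths/values): the flag is passed on the convert command line, e.g.
#   lmdeploy convert llama2 /path/to/awq-model --model-format awq --group-size 128 --tp 2 --w4-pad-size 2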
parser.set_defaults(run=CLI.convert)
@staticmethod
......
......@@ -197,6 +197,7 @@ def main(model_name: str,
quant_path: str = None,
group_size: int = 0,
w4_weight_layout: int = 2,
w4_pad_size: int = 2,
**kwargs):
"""deploy llama family models via turbomind.
......@@ -217,6 +218,7 @@ def main(model_name: str,
group_size (int): a parameter used in AWQ to quantize fp16 weights
to 4 bits
w4_weight_layout (int): a parameter used in AWQ to control the layout of weight
w4_pad_size (int): a parameter used in AWQ to control the pad size of weight
kwargs (dict): other params for convert
"""
......@@ -263,12 +265,15 @@ def main(model_name: str,
cfg.rotary_embedding = cfg.size_per_head
cfg.group_size = group_size
cfg.w4_weight_layout = w4_weight_layout
cfg.w4_pad_size = w4_pad_size
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
assert group_size > 0, f'group_size: {group_size} should > 0'
print("w4_weight_layout:",w4_weight_layout)
#print("w4_weight_layout:",w4_weight_layout)
assert w4_weight_layout >= 0 and w4_weight_layout < 3, f'w4_weight_layout: {w4_weight_layout} should be >= 0 and < 3'
assert w4_pad_size >= 0 and w4_pad_size < 5, f'w4_pad_size: {w4_pad_size} should be >= 0 and < 5'
else:
#output_format = update_output_format(model_name, inferred_model_format,
# model_path, output_format)
......
......@@ -54,6 +54,7 @@ class TurbomindModelConfig:
size_per_head: int = 128
group_size: int = 0
w4_weight_layout: int = 2
w4_pad_size: int = 2
max_batch_size: int = 64
max_context_token_num: int = 1
step_length: int = 1
......@@ -208,6 +209,7 @@ class BaseOutputModel(ABC):
param = param.to(torch_type)
tprint(name, param.shape)
_tofile(param, osp.join(self.out_dir, name))
elif len(self.tm_params) > 0:
tm_params = self.tm_params
weight_type = self.cfg.weight_type
......@@ -228,6 +230,7 @@ class BaseOutputModel(ABC):
torch_tensor = torch_tensor.float()
for tm_tensor in tm_params[name]:
tm_tensor.copy_from(torch_tensor)
tm_params.pop(name)
else:
tprint('skip export', name, param.shape)
......@@ -325,10 +328,6 @@ def permute(x: torch.Tensor, size_per_head: int = 128):
return x.view(n_heads, 2, dim // n_heads // 2,
1).transpose(1, 2).reshape(dim, 1)
def permute_trans(x: torch.Tensor):
if x.shape[-1]>1:
dim = x.shape[-1]
return x.view(-1, x.shape[-1]).transpose(0, 1).reshape(dim,-1)
def merge_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, tp: int,
dim: int):
......
......@@ -8,7 +8,7 @@ import lmdeploy
from ..source_model.base import BaseInputModel, BaseReader
from .base import (OUTPUT_MODELS, BaseOutputModel, TurbomindModelConfig,
merge_qkv, permute,permute_trans)
merge_qkv, permute)
# import _turbomind as _tm
# TODO: find another way import _turbomind
......@@ -117,6 +117,7 @@ class TurbomindW4Model(BaseOutputModel):
group_size = self.cfg.group_size
tp = self.cfg.tensor_para_size
w4_weight_layout = self.cfg.w4_weight_layout
w4_pad_size = self.cfg.w4_pad_size
size_per_head = self.cfg.size_per_head
# attn
q_qw, k_qw, v_qw, o_qw = get_cuda_tensor(bin.attn(i))
......@@ -134,48 +135,15 @@ class TurbomindW4Model(BaseOutputModel):
qkv_qz = merge_qkv(q_qz, k_qz, v_qz, tp, dim=2)
qkv_s = merge_qkv(q_s, k_s, v_s, tp, dim=2)
pad_group_count=2
if w4_weight_layout==1 or w4_weight_layout==2:
if qkv_qw.shape[0]%4096==0:
qkv_qw_padding=torch.zeros(group_size*pad_group_count,qkv_qw.shape[1],dtype=torch.int32).cuda()
qkv_qw =torch.cat((qkv_qw,qkv_qw_padding),dim=0).contiguous()
qkv_qz_padding =torch.zeros(pad_group_count,qkv_qz.shape[1],dtype=torch.int32).cuda()
qkv_qz =torch.cat((qkv_qz,qkv_qz_padding),dim=0).contiguous()
qkv_s_padding =torch.zeros(pad_group_count,qkv_s.shape[1],dtype=torch.float16).cuda()
qkv_s =torch.cat((qkv_s,qkv_s_padding),dim=0).contiguous()
qkv_qw, qkv_sz = convert_s4_(qkv_qw, qkv_qz, qkv_s, group_size)
qkv_qw = tp_m_s4(qkv_qw, tp)
qkv_sz = permute_trans(qkv_sz)
else:
qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
qkv_qw = tp_m_s4(qkv_qw, tp)
#print("请设置weight layout\n")
qkv_qw, qkv_sz = convert_s4(qkv_qw, qkv_qz, qkv_s, group_size)
self.save_split(qkv_qw, f'layers.{i}.attention.w_qkv.qweight', -1)
self.save_split(qkv_sz, f'layers.{i}.attention.w_qkv.scales_zeros', -1)
if w4_weight_layout==1 or w4_weight_layout==2:
if o_qw.shape[0]%4096==0:
o_qw_padding=torch.zeros(group_size*pad_group_count,o_qw.shape[1],dtype=torch.int32).cuda()
o_qw =torch.cat((o_qw,o_qw_padding),dim=0).contiguous()
o_qz_padding =torch.zeros(pad_group_count,o_qz.shape[1],dtype=torch.int32).cuda()
o_qz =torch.cat((o_qz,o_qz_padding),dim=0).contiguous()
o_s_padding =torch.zeros(pad_group_count,o_s.shape[1],dtype=torch.float16).cuda()
o_s =torch.cat((o_s,o_s_padding),dim=0).contiguous()
o_qw, o_sz = convert_s4_(o_qw, o_qz, o_s, group_size)
o_sz = permute_trans(o_sz)
else:
o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
o_qw, o_sz = convert_s4(o_qw, o_qz, o_s, group_size)
self.save_split(o_qw, f'layers.{i}.attention.wo.qweight', 0)
self.save_split(o_sz, f'layers.{i}.attention.wo.scales_zeros', 0)
q_b, k_b, v_b, o_b = get_cuda_tensor(bin.attn_bias(i))
if q_b is not None:
q_b = permute(q_b, size_per_head)
......@@ -184,6 +152,7 @@ class TurbomindW4Model(BaseOutputModel):
self.save_split(qkv_b, f'layers.{i}.attention.w_qkv.bias', -1)
self.save_split(o_b, f'layers.{i}.attention.wo.bias', copy=True)
# ffn weights
w1_qw, w2_qw, w3_qw = get_cuda_tensor(bin.ffn(i))
w1_qz, w2_qz, w3_qz = get_cuda_tensor(bin.ffn_zero(i))
......@@ -191,45 +160,12 @@ class TurbomindW4Model(BaseOutputModel):
w13_qw, w13_qz, w13_s = fuse_w1_w3_s4(w1_qw, w1_qz, w1_s, w3_qw, w3_qz,
w3_s)
if w4_weight_layout==1 or w4_weight_layout==2:
if w13_qw.shape[0]%4096==0:
w13_qw_padding=torch.zeros(group_size*pad_group_count,w13_qw.shape[1],dtype=torch.int32).cuda()
w13_qw =torch.cat((w13_qw,w13_qw_padding),dim=0).contiguous()
w13_qz_padding =torch.zeros(pad_group_count,w13_qz.shape[1],dtype=torch.int32).cuda()
w13_qz =torch.cat((w13_qz,w13_qz_padding),dim=0).contiguous()
w13_s_padding =torch.zeros(pad_group_count,w13_s.shape[1],dtype=torch.float16).cuda()
w13_s =torch.cat((w13_s,w13_s_padding),dim=0).contiguous()
w13_qw, w13_sz = convert_s4_(w13_qw, w13_qz, w13_s, group_size)
w13_qw = tp_m_s4(w13_qw, tp)
w13_sz = permute_trans(w13_sz)
else:
w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
w13_qw = tp_m_s4(w13_qw, tp)
w13_qw, w13_sz = convert_s4(w13_qw, w13_qz, w13_s, group_size)
self.save_split(w13_qw, f'layers.{i}.feed_forward.w13.qweight', -1)
self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',
-1)
if w4_weight_layout==1 or w4_weight_layout==2:
#padding
if w2_qw.shape[0]%4096==0:
w2_qw_padding=torch.zeros(group_size*pad_group_count,w2_qw.shape[1],dtype=torch.int32).cuda()
w2_qw =torch.cat((w2_qw,w2_qw_padding),dim=0).contiguous()
w2_qz_padding =torch.zeros(pad_group_count,w2_qz.shape[1],dtype=torch.int32).cuda()
w2_qz =torch.cat((w2_qz,w2_qz_padding),dim=0).contiguous()
w2_s_padding =torch.zeros(pad_group_count,w2_s.shape[1],dtype=torch.float16).cuda()
w2_s =torch.cat((w2_s,w2_s_padding),dim=0).contiguous()
w2_qw, w2_sz = convert_s4_(w2_qw, w2_qz, w2_s, group_size)
w2_sz = permute_trans(w2_sz)
else:
w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
self.save_split(w13_sz, f'layers.{i}.feed_forward.w13.scales_zeros',-1)
w2_qw, w2_sz = convert_s4(w2_qw, w2_qz, w2_s, group_size)
self.save_split(w2_qw, f'layers.{i}.feed_forward.w2.qweight', 0)
self.save_split(w2_sz, f'layers.{i}.feed_forward.w2.scales_zeros', 0)
......
......@@ -148,6 +148,7 @@ class TurboMind:
model_format: Optional[str] = None,
group_size: Optional[int] = None,
w4_weight_layout: Optional[int] = None,
w4_pad_size: Optional[int] = None,
tp: Optional[int] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
**kwargs):
......@@ -181,6 +182,7 @@ class TurboMind:
model_format=model_format,
group_size=group_size,
w4_weight_layout=w4_weight_layout,
w4_pad_size=w4_pad_size,
tp=tp,
**kwargs)
......@@ -237,6 +239,28 @@ class TurboMind:
threads.append(t)
for t in threads:
t.join()
def _modify_weight(self, model_comm):
"""modify weight if from_hf with awq."""
# TODO: support mpi
self.node_id = 0
self.node_num = 1
self.nccl_params = model_comm.create_nccl_params(self.node_id)
torch.cuda.synchronize()
def _modify_weight_func(device_id):
with cuda_ctx(device_id):
rank = self.node_id * self.gpu_count + device_id
model_comm.modify_shared_weights(device_id, rank)
threads = []
for device_id in range(self.gpu_count):
t = Thread(target=_modify_weight_func, args=(device_id, ))
t.start()
threads.append(t)
for t in threads:
t.join()
def _load_kv_qparams(self, model_path, tm_params, **kwargs):
"""Load kv qparams when loading from hf."""
......@@ -271,10 +295,10 @@ class TurboMind:
t.join()
for _ in range(self.gpu_count):
tensor_map = que.get()
tensor_map = que.get()
for k, v in tensor_map.items():
if k not in tm_params:
tm_params[k] = []
tm_params[k] = []
tm_params[k].append(v)
def _from_hf(self, model_source: ModelSource, model_path: str,
......@@ -307,10 +331,11 @@ class TurboMind:
data_type = 'int4'
cfg.group_size = 128
cfg.w4_weight_layout = 2
cfg.w4_pad_size = 0
else:
# output_format = update_output_format(cfg.model_name,
# inferred_model_format,
# model_path, output_format)
output_format = update_output_format(cfg.model_name,
inferred_model_format,
model_path, output_format)
data_type = output_format
update_config_weight_type(output_format, cfg)
......@@ -342,12 +367,15 @@ class TurboMind:
# copy hf model weight to turbomind weight
tm_params = output_model.tm_params
self._get_model_params(model_comm, tm_params)
logger.warning(f'get {len(tm_params)} model params')
output_model.export()
self._modify_weight(model_comm)
# load kv qparams
self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
assert len(tm_params) == 0, f'missing {tm_params.keys()}'
return model_comm
......@@ -381,7 +409,6 @@ class TurboMind:
self.config = cfg
self.model_name = cfg.model_name
self.data_type = cfg.weight_type
#print("from_workspace_cfg:",cfg)
# create model
logger.warning(f'model_config:\n\n{cfg.toini()}')
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
__dcu_version__ = '0.2.6'
__dcu_version__ = '0.2.6+das1.1.git7063377.abi0.dtk2404.torch2.1.0'
__version__ = '0.2.6'
short_version = __version__
......
......@@ -76,15 +76,75 @@ void reformat_s4_k_m8(uint32_t* dst, const uint32_t* src, int m, int k, cudaStre
permute_u4<0, 1, 2, 3, 4, 5, 6, 7, 8, 9><<<512, 512, 0, st>>>(dst, src, shape);
}
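// Debug helper: copies `size` elements from device memory to the host and prints them
// (half, half2 and uint32_t are printed; the stream argument is currently unused).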
template <typename T>
void PrintData(cudaStream_t stream, const T* input,int size)
{
int input_size=size;
T* h_data;
h_data=new T[input_size];
cudaMemcpy(h_data,input, input_size * sizeof(T), cudaMemcpyDeviceToHost);
if constexpr (std::is_same<T, half>::value)
{
for(int i=0;i<input_size;i++)
{
printf("%f ",__half2float(h_data[i]));
}
}
else if constexpr(std::is_same<T, half2>::value)
{
for(int i=0;i<input_size;i++)
{
printf("x:%f y:%f ",__half2float(h_data[i].data[0]),__half2float(h_data[i].data[1]));
}
}
else if constexpr(std::is_same<T, uint32_t>::value)
{
for(int i=0;i<input_size;i++)
{
printf(" %u ",h_data[i]);
}
}
printf("\n");
delete[] h_data;
}
#define INSTANTIATEPRINTDATA(T) \
template void PrintData(cudaStream_t stream, const T* input,int size);
INSTANTIATEPRINTDATA(__half)
INSTANTIATEPRINTDATA(float)
INSTANTIATEPRINTDATA(half2)
INSTANTIATEPRINTDATA(uint32_t)
void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
{
// permutation for [k, m/8] layout
Array<int, 10> shape{1, k / 8, 2, 2, 2, 1, m / 8, 2, 2, 2};
// 0123456-->4,6,7,5,0,3,1,2
//permute_u4<4, 6, 7, 5, 0, 3, 1, 2><<<512, 512, 0, st>>>(dst, src, shape);
permute_u4<5, 6, 8, 9, 7, 0, 1, 4, 2, 3><<<512, 512, 0, st>>>(dst, src, shape);
}
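// Transposes a row-major [k, m] matrix of 32-bit words into [m, k]:
// thread id = i * k + j writes dst[id] = src[j * m + i].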
__global__ void permute_u32(int num_kernels, uint32_t* dst, const uint32_t* src, int m, int k)
{
//[k,m]-->[m,k]
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id >= num_kernels) return;
int j=id%k;
int i=id/k;
dst[id]=src[j*m+i];
}
void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st)
{
// permutation for [k, m] layout
int num_kernels=k*m;
permute_u32<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,st>>>(num_kernels,dst, src,m,k);
}
__global__ void dequantize_s4_offset_64(uint4* dst, const uint32_t* src, size_t count)
{
for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < count; i += blockDim.x * gridDim.x) {
......@@ -269,8 +329,7 @@ __global__ void input_padding_kernel(int num_kernels,T* output,const T* input,in
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount)
{
//the input has size [m,k]; the output has size [m,n+group_size]
//
int num_kernels=m*(k+pad_groupcount*group_size);
input_padding_kernel<<<(num_kernels+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE,0,stream>>>(num_kernels, output,input,m,k,group_size,pad_groupcount);
}
......@@ -282,3 +341,5 @@ template void input_padding(cudaStream_t stream, T* output,const T* input,int m,
INSTANTIATEINPUTPADING(__half)
} // namespace turbomind
......@@ -36,6 +36,11 @@ void dequant_w4_gemm_colmajor(cudaStream_t stream, half* output,const uint32_t*
template <typename T>
void input_padding(cudaStream_t stream, T* output,const T* input,int m,int k,int group_size,int pad_groupcount);
void reformat_s4_k_m8_tarnsw4(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);
void reformat_s4_k_m8_tarnsscale(uint32_t* dst, const uint32_t* src, int m, int k, cudaStream_t st);
template <typename T>
void PrintData(cudaStream_t stream, const T* input,int size);
class GemmS4F16 {
public:
GemmS4F16();
......
......@@ -17,7 +17,7 @@
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/utils/logger.h"
......@@ -42,6 +42,7 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
WeightType weight_type,
int group_size,
int w4_weight_layout,
int w4_pad_size,
bool attn_bias,
size_t tensor_para_size,
size_t tensor_para_rank):
......@@ -60,36 +61,42 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
self_attn_weights.qkv.type = weight_type;
self_attn_weights.qkv.group_size = group_size;
self_attn_weights.qkv.w4_weight_layout = w4_weight_layout;
self_attn_weights.qkv.w4_pad_size = w4_pad_size;
self_attn_weights.output.input_dims = hidden_units_ / tensor_para_size_;
self_attn_weights.output.output_dims = hidden_units_;
self_attn_weights.output.type = weight_type;
self_attn_weights.output.group_size = group_size;
self_attn_weights.output.w4_weight_layout = w4_weight_layout;
self_attn_weights.output.w4_pad_size = w4_pad_size;
ffn_weights.gating.input_dims = hidden_units_;
ffn_weights.gating.output_dims = inter_size_ / tensor_para_size_;
ffn_weights.gating.type = weight_type;
ffn_weights.gating.group_size = group_size;
ffn_weights.gating.w4_weight_layout = w4_weight_layout;
ffn_weights.gating.w4_pad_size = w4_pad_size;
ffn_weights.intermediate.input_dims = hidden_units_;
ffn_weights.intermediate.output_dims = inter_size_ / tensor_para_size_;
ffn_weights.intermediate.type = weight_type;
ffn_weights.intermediate.group_size = group_size;
ffn_weights.intermediate.w4_weight_layout = w4_weight_layout;
ffn_weights.intermediate.w4_pad_size = w4_pad_size;
ffn_weights.fused_gating_intermediate.input_dims = hidden_units_;
ffn_weights.fused_gating_intermediate.output_dims = inter_size_ / tensor_para_size_ * 2;
ffn_weights.fused_gating_intermediate.type = weight_type;
ffn_weights.fused_gating_intermediate.group_size = group_size;
ffn_weights.fused_gating_intermediate.w4_weight_layout = w4_weight_layout;
ffn_weights.fused_gating_intermediate.w4_pad_size = w4_pad_size;
ffn_weights.output.input_dims = inter_size_ / tensor_para_size_;
ffn_weights.output.output_dims = hidden_units_;
ffn_weights.output.type = weight_type;
ffn_weights.output.group_size = group_size;
ffn_weights.output.w4_weight_layout = w4_weight_layout;
ffn_weights.output.w4_pad_size = w4_pad_size;
mallocWeights();
}
......@@ -118,16 +125,9 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
FT_CHECK(weights.input_dims % factor == 0);
// // read the environment variable
// int m_weightlayout_switch=1;
// const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
// if (env_weightlayout_str != nullptr) {
// m_weightlayout_switch = std::stoi(env_weightlayout_str);
// }
if((weights.input_dims%4096==0)&&(weights.w4_weight_layout==1||weights.w4_weight_layout==2))
{
size_t new_input_dims=weights.input_dims+2*weights.group_size;
size_t new_input_dims=weights.input_dims+weights.w4_pad_size*weights.group_size;
deviceMalloc((int**)&weights.kernel, new_input_dims * weights.output_dims / factor);
deviceMemSetZero((int*)weights.kernel, new_input_dims* weights.output_dims / factor);
......@@ -171,15 +171,10 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
}
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
// // read the environment variable
// int m_weightlayout_switch=1;
// const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
// if (env_weightlayout_str != nullptr) {
// m_weightlayout_switch = std::stoi(env_weightlayout_str);
// }
if((weights.input_dims%4096==0)&&(weights.w4_weight_layout==1||weights.w4_weight_layout==2))
{
size_t new_input_dims=weights.input_dims+weights.group_size;
size_t new_input_dims=weights.input_dims+ weights.w4_pad_size*weights.group_size;
output.insert(get_name("qweight"),
Tensor{MEMORY_GPU,
......@@ -189,7 +184,7 @@ void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string&
output.insert(get_name("scales_zeros"),
Tensor{MEMORY_GPU,
getTensorType<T>(),
{new_input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)},
{new_input_dims * weights.output_dims/ weights.group_size * 2 * sizeof(T)},
weights.scales_and_zeros});
}
else{
......@@ -307,23 +302,36 @@ void loadWeights(LlamaDenseWeight<T>& w,
FT_CHECK(dim1 % factor == 0);
// //读环境变量
// int m_weightlayout_switch=1;
// const char* env_weightlayout_str = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
// if (env_weightlayout_str != nullptr) {
// m_weightlayout_switch = std::stoi(env_weightlayout_str);
// }
if((dim0%4096==0)&&(w.w4_weight_layout==1||w.w4_weight_layout==2))
if(w.w4_weight_layout==1||w.w4_weight_layout==2)// transpose required
{
size_t new_dim0=dim0+2*w.group_size;
std::vector<size_t> w_shape{new_dim0, dim1 / factor * sizeof(uint32_t)};
loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
const size_t group_count = w.group_size > 0 ? new_dim0 / w.group_size : 1;
loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
size_t new_dim0=dim0;
if(dim0%4096==0) new_dim0=dim0+w.w4_pad_size*w.group_size;
// allocate memory
int* kernel_workspace=nullptr;
half* scales_workspace=nullptr;
deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
deviceMemSetZero((int*)kernel_workspace, new_dim0* dim1 / factor);
deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
// load the weights
std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
loadWeightFromBin((int8_t*)kernel_workspace, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
const size_t group_count = w.group_size > 0 ? dim0 / w.group_size : 1;
loadWeightFromBin((half*)scales_workspace, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
// transpose
reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel,(uint32_t*)kernel_workspace,dim1,new_dim0,0);
reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros,(uint32_t*)scales_workspace,dim1,new_dim0/w.group_size,0);
// free the memory
cudaFree(kernel_workspace);
cudaFree(scales_workspace);
kernel_workspace = nullptr;
scales_workspace = nullptr;
}
else{
std::vector<size_t> w_shape{dim0, dim1 / factor * sizeof(uint32_t)};
loadWeightFromBin((int8_t*)w.kernel, w_shape, prefix + ".qweight", FtCudaDataType::INT8, {});
......@@ -332,9 +340,57 @@ void loadWeights(LlamaDenseWeight<T>& w,
loadWeightFromBin((half*)w.scales_and_zeros, {group_count, dim1 * 2}, prefix + ".scales_zeros", type, {});
}
// pad and transpose the weight here
}
}
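// Re-lays out an already-loaded INT4 weight in place for w4_weight_layout 1/2:
// the current kernel and scales are copied to scratch buffers (padded by
// w4_pad_size extra groups when input_dims is a multiple of 4096) and then
// written back through the reformat_s4_k_m8_tarns* transpose kernels.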
template<typename T>
void transWeights(LlamaDenseWeight<T>& w,
FtCudaDataType model_file_type)
{
const auto type = model_file_type;
size_t dim0 = w.input_dims;
size_t dim1 = w.output_dims;
const size_t bit_size = getBitSize(w.type);
const int factor = sizeof(float) * 8 / bit_size;
FT_CHECK(dim1 % factor == 0);
if(w.w4_weight_layout==1||w.w4_weight_layout==2)// transpose required
{
size_t new_dim0=dim0;
if(dim0%4096==0) new_dim0=dim0+w.w4_pad_size*w.group_size;
// allocate memory
int* kernel_workspace=nullptr;
half* scales_workspace=nullptr;
deviceMalloc((int**)&kernel_workspace, new_dim0 * dim1 / factor);
deviceMemSetZero((int*)kernel_workspace, new_dim0* dim1 / factor);
deviceMalloc((half**)&scales_workspace, new_dim0 / w.group_size * dim1 * 2);
deviceMemSetZero((half*)scales_workspace, new_dim0 / w.group_size * dim1 * 2);
// copy the already-loaded weights
cudaD2Dcpy((int*)kernel_workspace,(int*)w.kernel, dim0* dim1 / factor);
cudaD2Dcpy((half*)scales_workspace,(half*)w.scales_and_zeros, dim0 / w.group_size * dim1 * 2);
// transpose
reformat_s4_k_m8_tarnsw4((uint32_t*)w.kernel,(uint32_t*)kernel_workspace,dim1,new_dim0,0);
reformat_s4_k_m8_tarnsscale((uint32_t*)w.scales_and_zeros,(uint32_t*)scales_workspace,dim1,new_dim0/w.group_size,0);
// free the memory
cudaFree(kernel_workspace);
cudaFree(scales_workspace);
kernel_workspace = nullptr;
scales_workspace = nullptr;
}
}
template<typename T>
void LlamaDecoderLayerWeight<T>::mallocWeights()
{
......@@ -420,6 +476,19 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
}
}
template<typename T>
void LlamaDecoderLayerWeight<T>::modifyModel(FtCudaDataType model_file_type)
{
const auto rank_spec = std::to_string(tensor_para_rank_);
const auto type = model_file_type;
transWeights(self_attn_weights.qkv,type);
transWeights(self_attn_weights.output,type);
transWeights(ffn_weights.fused_gating_intermediate,type);
transWeights(ffn_weights.output,type);
}
template<typename T>
TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)
{
......
......@@ -36,6 +36,7 @@ public:
WeightType weight_type,
int group_size,
int w4_weight_layout,
int w4_pad_size,
bool attn_bias,
size_t tensor_para_size,
size_t tensor_para_rank);
......@@ -44,6 +45,7 @@ public:
LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;
void loadModel(std::string dir_path, FtCudaDataType model_file_type);
void modifyModel(FtCudaDataType model_file_type);
TensorMap getParams(std::string prefix);
......
......@@ -64,6 +64,7 @@ struct LlamaDenseWeight {
T* scales_and_zeros;
int group_size;
int w4_weight_layout;
int w4_pad_size;
};
template<typename T>
......
......@@ -117,19 +117,19 @@ private:
//check whether the xpad workspace is large enough
if(weight.input_dims%4096==0) // padding required
{
int pad_group_count=2;
input_padding(stream_,reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_),(const T*)input_data,batch_size,weight.input_dims,weight.group_size,pad_group_count);
dequant_w4_gemm_colmajor(stream_,reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),(const uint32_t*)weight.kernel,(const half2*)weight.scales_and_zeros,weight.input_dims+pad_group_count*weight.group_size ,weight.output_dims,weight.group_size);
input_padding(stream_,reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_),(const T*)input_data,batch_size,weight.input_dims,weight.group_size,weight.w4_pad_size);
dequant_w4_gemm_colmajor(stream_,reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),(const uint32_t*)weight.kernel,(const half2*)weight.scales_and_zeros,weight.input_dims+weight.w4_pad_size*weight.group_size ,weight.output_dims,weight.group_size);
cublas_wrapper_->Gemm(CUBLAS_OP_T,
CUBLAS_OP_N,
weight.output_dims,//m
batch_size,//n
weight.input_dims+pad_group_count*weight.group_size,//k
weight.input_dims+weight.w4_pad_size*weight.group_size,//k
(const T*) reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_), //[]
weight.input_dims+pad_group_count*weight.group_size, //k
weight.input_dims+weight.w4_pad_size*weight.group_size, //k
(const T*) cublas_wrapper_->xpading_workspace_,
weight.input_dims+pad_group_count*weight.group_size, //k
weight.input_dims+weight.w4_pad_size*weight.group_size, //k
output_data,
weight.output_dims); //m
}
......@@ -155,8 +155,7 @@ private:
//check whether the ck workspace is large enough
if(weight.input_dims%4096==0)
{
int pad_groupcount=2;
run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*> (output_data), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims),(weight.input_dims), (weight.input_dims+pad_groupcount*weight.group_size), weight.output_dims, weight.group_size,reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),CK_WORKSPACE_SIZE,(hipStream_t)stream_);
run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*> (output_data), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims),(weight.input_dims), (weight.input_dims+weight.w4_pad_size*weight.group_size), weight.output_dims, weight.group_size,reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),CK_WORKSPACE_SIZE,(hipStream_t)stream_);
}
// A B0 B1 C M N K strideA strideB strideBpad strideC group_size
else{
......@@ -208,19 +207,19 @@ private:
//check whether the xpad workspace is large enough
if(weight.input_dims%4096==0) // padding required
{
int pad_group_count=2;
input_padding<T>(stream_,reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_),(const T*)input_data,batch_size,weight.input_dims,weight.group_size,pad_group_count);
dequant_w4_gemm_colmajor(stream_,reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),(const uint32_t*)weight.kernel,(const half2*)weight.scales_and_zeros,weight.input_dims+pad_group_count*weight.group_size,weight.output_dims,weight.group_size);
input_padding<T>(stream_,reinterpret_cast<half*>(cublas_wrapper_->xpading_workspace_),(const T*)input_data,batch_size,weight.input_dims,weight.group_size,weight.w4_pad_size);
dequant_w4_gemm_colmajor(stream_,reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_),(const uint32_t*)weight.kernel,(const half2*)weight.scales_and_zeros,weight.input_dims+weight.w4_pad_size*weight.group_size,weight.output_dims,weight.group_size);
cublas_wrapper_->Gemm(CUBLAS_OP_T,
CUBLAS_OP_N,
weight.output_dims,//m
batch_size,//n
weight.input_dims+pad_group_count*weight.group_size,//k
weight.input_dims+weight.w4_pad_size*weight.group_size,//k
(const T*) reinterpret_cast<T*>(cublas_wrapper_->deweight_workspace_), //[]
weight.input_dims+pad_group_count*weight.group_size, //k
weight.input_dims+weight.w4_pad_size*weight.group_size, //k
(const T*) cublas_wrapper_->xpading_workspace_,
weight.input_dims+pad_group_count*weight.group_size, //k
weight.input_dims+weight.w4_pad_size*weight.group_size, //k
output_tmp,
weight.output_dims); //m
}
......@@ -246,8 +245,8 @@ private:
if(weight.input_dims%4096==0)
{
int pad_groupcount=2;
run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*> (output_tmp), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims),(weight.input_dims), (weight.input_dims+pad_groupcount*weight.group_size), weight.output_dims, weight.group_size,reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),CK_WORKSPACE_SIZE,(hipStream_t)stream_);
run_weight_only_gemm(reinterpret_cast<const void*>(input_data), reinterpret_cast<const void*>(weight.kernel), reinterpret_cast<const void*>(weight.scales_and_zeros), reinterpret_cast<void*> (output_tmp), batch_size, weight.output_dims, (weight.input_dims), (weight.input_dims),(weight.input_dims), (weight.input_dims+weight.w4_pad_size*weight.group_size), weight.output_dims, weight.group_size,reinterpret_cast<void*>(cublas_wrapper_->ck_workspace_),CK_WORKSPACE_SIZE,(hipStream_t)stream_);
}
// A B0 B1 C M N K strideA strideB strideBpad strideC group_size
else{
......
......@@ -33,6 +33,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
WeightType weight_type,
int group_size,
int w4_weight_layout,
int w4_pad_size,
size_t tensor_para_size,
size_t tensor_para_rank):
hidden_units_(head_num * size_per_head),
......@@ -57,6 +58,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
weight_type_,
group_size,
w4_weight_layout,
w4_pad_size,
attn_bias,
tensor_para_size_,
tensor_para_rank_));
......@@ -69,7 +71,7 @@ LlamaWeight<T>::LlamaWeight(size_t head_num,
std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
const char* env_value = str_w4_weight_layout.c_str();
setenv(env_name,env_value , 1);
//printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",w4_weight_layout);
}
else
{
......@@ -128,8 +130,23 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
for (unsigned layer = 0; layer < num_layer_; ++layer) {
decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
}
}
template<typename T>
void LlamaWeight<T>::modifyModel()
{
FtCudaDataType model_file_type = FtCudaDataType::FP16;
if(weight_type_ == WeightType::kBF16){
model_file_type = FtCudaDataType::BF16;
}
for (unsigned layer = 0; layer < num_layer_; ++layer) {
decoder_layer_weights[layer]->modifyModel(model_file_type);
}
}
template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
......
......@@ -38,6 +38,7 @@ struct LlamaWeight {
WeightType weight_type,
int group_size,
int w4_weight_layout,
int w4_pad_size,
size_t tensor_para_size,
size_t tensor_para_rank);
......@@ -47,6 +48,7 @@ struct LlamaWeight {
LlamaWeight& operator=(const LlamaWeight& other) = delete;
void loadModel(std::string dir_path);
void modifyModel();
TensorMap getParams();
......
......@@ -439,6 +439,11 @@ PYBIND11_MODULE(_turbomind, m)
py::call_guard<py::gil_scoped_release>(),
"device_id"_a,
"rank"_a)
.def("modify_shared_weights",
&AbstractTransformerModel::modifySharedWeights,
py::call_guard<py::gil_scoped_release>(),
"device_id"_a,
"rank"_a)
.def(
"get_params",
[](AbstractTransformerModel* model, int deviceId, int rank) {
......
......@@ -187,6 +187,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
quant_policy_ = reader.GetInteger("llama", "quant_policy", 0);
group_size_ = reader.GetInteger("llama", "group_size", 0);
w4_weight_layout_ = reader.GetInteger("llama", "w4_weight_layout", 2);
w4_pad_size_ = reader.GetInteger("llama", "w4_pad_size", 2);
// rotary embedding parameters
attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
......@@ -383,6 +384,7 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
weight_type_,
group_size_,
w4_weight_layout_,
w4_pad_size_,
tensor_para_size_,
tensor_para_rank);
// model inited with model_dir
......@@ -392,6 +394,21 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
return;
}
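// Re-lays out the shared weights on the given device after they have been populated;
// only INT4 (AWQ) weights need the extra transpose, so other weight types are a no-op.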
template<typename T>
void LlamaTritonModel<T>::modifySharedWeights(int device_id, int rank)
{
ft::check_cuda_error(cudaSetDevice(device_id));
const int tensor_para_rank = rank % tensor_para_size_;
const int pipeline_para_rank = rank / tensor_para_size_;
ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0);
if(weight_type_== turbomind::WeightType::kINT4)
{
shared_weights_[device_id]->modifyModel();
}
return;
}
template<typename T>
TensorMap LlamaTritonModel<T>::getParams(int deviceId, int rank)
{
......
......@@ -53,6 +53,7 @@ struct LlamaTritonModel: public AbstractTransformerModel {
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;
void createSharedWeights(int deviceId, int rank) override;
void modifySharedWeights(int deviceId, int rank) override;
TensorMap getParams(int deviceId, int rank) override;
......@@ -102,6 +103,7 @@ private:
int quant_policy_;
int group_size_;
int w4_weight_layout_;
int w4_pad_size_;
// shared weights for each device
std::vector<std::shared_ptr<ft::LlamaWeight<T>>> shared_weights_;
......
......@@ -325,6 +325,7 @@ struct AbstractTransformerModel {
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) = 0;
virtual void createSharedWeights(int deviceId, int rank) = 0;
virtual void modifySharedWeights(int deviceId, int rank) = 0;
virtual TensorMap getParams(int deviceId, int rank) = 0;
......