Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1e0cb1f4
"vllm/vscode:/vscode.git/clone" did not exist on "06dd08256f076689945418cd61397c1759f4abfa"
Commit
1e0cb1f4
authored
Jul 20, 2024
by
zhuwenwen
Browse files
support nn layout
parent
281ca6c1
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
62 additions
and
49 deletions
+62
-49
csrc/ops.h
csrc/ops.h
+2
-1
csrc/quantization/gptq/q_gemm.cu
csrc/quantization/gptq/q_gemm.cu
+1
-1
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+1
-1
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+1
-4
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+1
-5
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+23
-2
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+23
-2
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+3
-10
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+5
-12
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+2
-11
No files found.
csrc/ops.h
View file @
1e0cb1f4
...
...
@@ -119,7 +119,8 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
void
gptq_shuffle
(
torch
::
Tensor
q_weight
,
torch
::
Tensor
q_perm
,
int64_t
bit
);
void
trans_w16_gemm
(
torch
::
Tensor
dst
,
torch
::
Tensor
src
,
int64_t
row
,
int64_t
col
);
void
trans_w16_gemm
(
torch
::
Tensor
dst
,
torch
::
Tensor
src
,
int64_t
row
,
int64_t
col
);
// void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
// torch::Tensor& scale);
...
...
csrc/quantization/gptq/q_gemm.cu
View file @
1e0cb1f4
...
...
@@ -1548,7 +1548,7 @@ __global__ void trans_w16_gemm_cudakernel(int64_t num_kernels,T* dst,const T* sr
int64_t
id
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
id
>=
num_kernels
)
return
;
int64_t
j
=
id
%
row
;
//dst的列id
int64_t
j
=
id
%
row
;
int64_t
i
=
id
/
row
;
dst
[
i
*
row
+
j
]
=
src
[
j
*
col
+
i
];
...
...
csrc/torch_bindings.cpp
View file @
1e0cb1f4
...
...
@@ -160,7 +160,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops
.
impl
(
"gptq_shuffle"
,
torch
::
kCUDA
,
&
gptq_shuffle
);
// trans w16
ops
.
def
(
"trans_w16_gemm(Tensor! dst, Tensor src, int row,int col) -> ()"
);
ops
.
def
(
"trans_w16_gemm(Tensor! dst, Tensor src, int row,
int col) -> ()"
);
ops
.
impl
(
"trans_w16_gemm"
,
torch
::
kCUDA
,
&
trans_w16_gemm
);
// Quantized GEMM for SqueezeLLM.
...
...
vllm/model_executor/layers/linear.py
View file @
1e0cb1f4
...
...
@@ -89,7 +89,6 @@ class UnquantizedLinearMethod(LinearMethodBase):
self
.
separate_bias_add
=
separate_bias_add
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
input_size_per_partition
:
int
,
output_partition_sizes
:
List
[
int
],
input_size
:
int
,
...
...
@@ -110,15 +109,13 @@ class UnquantizedLinearMethod(LinearMethodBase):
weight
=
layer
.
weight
if
self
.
separate_bias_add
:
if
bias
is
not
None
:
return
F
.
linear
(
x
,
weight
)
+
bias
return
F
.
linear
(
x
,
weight
)
if
self
.
use_llama_nn
:
if
bias
is
not
None
:
return
torch
.
matmul
(
x
,
weight
)
+
bias
return
torch
.
addmm
(
bias
,
x
,
weight
)
else
:
return
torch
.
matmul
(
x
,
weight
)
else
:
...
...
vllm/model_executor/model_loader/utils.py
View file @
1e0cb1f4
...
...
@@ -22,13 +22,9 @@ def set_default_torch_dtype(dtype: torch.dtype):
def
get_model_architecture
(
model_config
:
ModelConfig
)
->
Tuple
[
Type
[
nn
.
Module
],
str
]:
architectures
=
getattr
(
model_config
.
hf_config
,
"architectures"
,
[])
if
architectures
==
[
'LlamaForCausalLM'
]
or
architectures
==
[
'Qwen2ForCausalLM'
]
or
architectures
==
[
'ChatGLMModel'
]
or
architectures
==
[
'BaichuanForCausalLM'
]:
if
architectures
==
[
'LlamaForCausalLM'
]
or
architectures
==
[
'QWenLMHeadModel'
]
or
architectures
==
[
'Qwen2ForCausalLM'
]
or
architectures
==
[
'ChatGLMModel'
]
or
architectures
==
[
'BaichuanForCausalLM'
]:
if
os
.
getenv
(
'LLAMA_NN'
)
!=
'0'
:
os
.
environ
[
'LLAMA_NN'
]
=
'1'
if
os
.
getenv
(
'GEMM_PAD'
)
!=
'1'
:
os
.
environ
[
'GEMM_PAD'
]
=
'0'
if
os
.
getenv
(
'FA_PAD'
)
!=
'1'
:
os
.
environ
[
'FA_PAD'
]
=
'0'
else
:
os
.
environ
[
'LLAMA_NN'
]
=
'0'
# Special handling for quantized Mixtral.
...
...
vllm/model_executor/models/baichuan.py
View file @
1e0cb1f4
...
...
@@ -25,6 +25,7 @@ import torch
from
torch
import
nn
from
transformers
import
PretrainedConfig
import
os
import
re
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
...
...
@@ -45,6 +46,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplerOutput
from
vllm
import
_custom_ops
as
ops
def
_get_alibi_slopes
(
total_num_heads
:
int
)
->
torch
.
Tensor
:
...
...
@@ -179,8 +181,6 @@ class BaiChuanAttention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
W_pack
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
chunk
(
chunks
=
3
,
dim
=-
1
)
if
self
.
postion_embedding
!=
"ALIBI"
:
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
...
...
@@ -329,6 +329,7 @@ class BaiChuanBaseForCausalLM(nn.Module):
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
)
self
.
logits_processor
=
LogitsProcessor
(
config
.
vocab_size
)
self
.
sampler
=
Sampler
()
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
def
forward
(
self
,
...
...
@@ -396,6 +397,26 @@ class BaiChuanBaseForCausalLM(nn.Module):
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
if
self
.
use_llama_nn
:
lay_key_words
=
[
"self_attn.W_pack.weight"
,
"self_attn.o_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.down_proj.weight"
]
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
class
BaichuanForCausalLM
(
BaiChuanBaseForCausalLM
):
...
...
vllm/model_executor/models/chatglm.py
View file @
1e0cb1f4
...
...
@@ -8,6 +8,7 @@ import torch
from
torch
import
nn
from
torch.nn
import
LayerNorm
import
os
import
re
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
...
...
@@ -28,6 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplerOutput
from
vllm.transformers_utils.configs
import
ChatGLMConfig
from
vllm
import
_custom_ops
as
ops
class
GLMAttention
(
nn
.
Module
):
...
...
@@ -102,8 +104,6 @@ class GLMAttention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
query_key_value
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
split
([
self
.
q_size
,
self
.
kv_size
,
self
.
kv_size
],
dim
=-
1
)
q
,
k
=
self
.
rotary_emb
(
position_ids
,
q
,
k
)
context_layer
=
self
.
attn
(
...
...
@@ -356,6 +356,7 @@ class ChatGLMForCausalLM(nn.Module):
self
.
lm_head_weight
=
self
.
transformer
.
output_layer
.
weight
self
.
logits_processor
=
LogitsProcessor
(
config
.
padded_vocab_size
)
self
.
sampler
=
Sampler
()
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
def
forward
(
self
,
...
...
@@ -396,3 +397,23 @@ class ChatGLMForCausalLM(nn.Module):
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
if
self
.
use_llama_nn
:
lay_key_words
=
[
"self_attention.query_key_value.weight"
,
"self_attention.dense.weight"
,
"mlp.dense_h_to_4h.weight"
,
"mlp.dense_4h_to_h.weight"
]
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
vllm/model_executor/models/llama.py
View file @
1e0cb1f4
...
...
@@ -159,8 +159,6 @@ class LlamaAttention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
qkv_proj
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
split
([
self
.
q_size
,
self
.
kv_size
,
self
.
kv_size
],
dim
=-
1
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
attn_output
=
self
.
attn
(
q
,
k
,
v
,
kv_cache
,
attn_metadata
)
...
...
@@ -444,29 +442,24 @@ class LlamaForCausalLM(nn.Module):
weight_loader
(
param
,
loaded_weight
)
if
self
.
use_llama_nn
:
#以上代码模型权重已经加载完了
#以下代码使用正则匹配来找出要修改的weight
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.down_proj.weight"
]
#合并所有关键词为一个正则表达式
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
#print("key:\n",key)
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
#创建一个跟value一样大的tensor
if
matches
:
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
# If this function is called, it should always initialize KV cache scale
# factors (or else raise an exception). Thus, handled exceptions should
...
...
vllm/model_executor/models/qwen.py
View file @
1e0cb1f4
...
...
@@ -298,28 +298,21 @@ class QWenLMHeadModel(nn.Module):
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
if
self
.
use_llama_nn
:
#以上代码模型权重已经加载完了
#以下代码使用正则匹配来找出要修改的weight
lay_key_words
=
[
"
self_attn.qkv_proj
.weight"
,
"
self_
attn.
o
_proj.weight"
,
"
attn.c_attn
.weight"
,
"attn.
c
_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.
down
_proj.weight"
"mlp.
c
_proj.weight"
]
#合并所有关键词为一个正则表达式
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
#print("key:\n",key)
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
#print(layername)
# print(weight.data)
#创建一个跟value一样大的tensor
if
matches
:
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
...
...
vllm/model_executor/models/qwen2.py
View file @
1e0cb1f4
...
...
@@ -150,8 +150,6 @@ class Qwen2Attention(nn.Module):
attn_metadata
:
AttentionMetadata
,
)
->
torch
.
Tensor
:
qkv
,
_
=
self
.
qkv_proj
(
hidden_states
)
if
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
and
qkv
.
shape
[
-
1
]
==
12320
:
qkv
=
qkv
[...,:
-
32
]
q
,
k
,
v
=
qkv
.
split
([
self
.
q_size
,
self
.
kv_size
,
self
.
kv_size
],
dim
=-
1
)
q
,
k
=
self
.
rotary_emb
(
positions
,
q
,
k
)
attn_output
=
self
.
attn
(
q
,
k
,
v
,
kv_cache
,
attn_metadata
)
...
...
@@ -386,28 +384,21 @@ class Qwen2ForCausalLM(nn.Module):
weight_loader
(
param
,
loaded_weight
)
if
self
.
use_llama_nn
:
#以上代码模型权重已经加载完了
#以下代码使用正则匹配来找出要修改的weight
lay_key_words
=
[
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.down_proj.weight"
]
#合并所有关键词为一个正则表达式
combined_words
=
"|"
.
join
(
lay_key_words
)
for
layername
,
weight
in
params_dict
.
items
():
#print("key:\n",key)
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
#print(layername)
# print(weight.data)
#创建一个跟value一样大的tensor
if
matches
:
_weight
=
torch
.
zeros_like
(
weight
.
data
)
ori_shape
=
_weight
.
shape
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
ops
.
trans_w16_gemm
(
_weight
,
weight
.
data
,
_weight
.
shape
[
0
],
_weight
.
shape
[
1
])
weight
.
data
.
copy_
(
_weight
)
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment