xdb4_94051 / vllm · Commits · 0fbfc4b8

Add GPTQ support (#916)

Unverified commit 0fbfc4b8, authored Dec 15, 2023 by CHU Tianxiang, committed via GitHub Dec 15, 2023. Parent: c06170cc.

Changes: 35 files in the commit; this page (1 of 2) shows 15 changed files with 98 additions and 30 deletions (+98, -30).
Files changed on this page:

- vllm/model_executor/models/chatglm.py (+3, -0)
- vllm/model_executor/models/falcon.py (+22, -17)
- vllm/model_executor/models/gpt2.py (+0, -1)
- vllm/model_executor/models/gpt_j.py (+8, -1)
- vllm/model_executor/models/gpt_neox.py (+0, -1)
- vllm/model_executor/models/internlm.py (+8, -1)
- vllm/model_executor/models/llama.py (+8, -1)
- vllm/model_executor/models/mistral.py (+8, -1)
- vllm/model_executor/models/mixtral.py (+9, -2)
- vllm/model_executor/models/mpt.py (+3, -0)
- vllm/model_executor/models/opt.py (+8, -1)
- vllm/model_executor/models/phi_1_5.py (+3, -0)
- vllm/model_executor/models/qwen.py (+8, -2)
- vllm/model_executor/models/yi.py (+8, -1)
- vllm/model_executor/weight_utils.py (+2, -1)
vllm/model_executor/models/chatglm.py

```diff
@@ -377,6 +377,9 @@ class ChatGLMForCausalLM(nn.Module):
                 continue
             if "word_embeddings" in name:
                 name = name.replace(".word_embeddings", "")
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
```
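The guard added here, and repeated across every model file below, exists because GPTQ checkpoints often ship `.bias` tensors for layers whose vLLM counterparts register no bias parameter; without the check, `params_dict[name]` would raise `KeyError`. A minimal, self-contained sketch of the behavior (the tensor names are hypothetical, chosen only for illustration):

```python
import torch

# Hypothetical names, for illustration: a GPTQ-quantized linear registers
# qweight/qzeros/scales, but no bias.
params_dict = {
    "layers.0.down_proj.qweight": torch.zeros(2, dtype=torch.int32),
    "layers.0.down_proj.scales": torch.zeros(2, dtype=torch.float16),
}
# The checkpoint carries an extra bias tensor the model never registered.
checkpoint = dict(params_dict, **{"layers.0.down_proj.bias": torch.zeros(2)})

for name, loaded_weight in checkpoint.items():
    # Skip loading extra bias for GPTQ models -- same guard as the diff above.
    if name.endswith(".bias") and name not in params_dict:
        continue
    param = params_dict[name]  # would KeyError on the stray bias otherwise
```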
vllm/model_executor/models/falcon.py

```diff
@@ -425,27 +425,32 @@ class FalconForCausalLM(nn.Module):
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in hf_model_weights_iterator(
                 model_name_or_path, cache_dir, load_format, revision):
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
             param = params_dict[name]
             if "query_key_value" in name:
                 output_dim = getattr(param, "output_dim", None)
                 loaded_weight_shape = loaded_weight.shape
-                loaded_weight = loaded_weight.view(
-                    loaded_weight_shape[:output_dim] +
-                    (total_num_kv_heads, num_query_heads_per_kv_head + 2, -1) +
-                    loaded_weight_shape[output_dim + 1:])
-                wq = loaded_weight.narrow(
-                    output_dim + 1, 0, num_query_heads_per_kv_head).reshape(
-                        *loaded_weight_shape[:output_dim], -1,
-                        *loaded_weight_shape[output_dim + 1:])
-                wk = loaded_weight.narrow(
-                    output_dim + 1, num_query_heads_per_kv_head, 1).reshape(
-                        *loaded_weight_shape[:output_dim], -1,
-                        *loaded_weight_shape[output_dim + 1:])
-                wv = loaded_weight.narrow(
-                    output_dim + 1, num_query_heads_per_kv_head + 1, 1).reshape(
-                        *loaded_weight_shape[:output_dim], -1,
-                        *loaded_weight_shape[output_dim + 1:])
-                loaded_weight = torch.cat([wq, wk, wv], dim=output_dim)
+                if output_dim is not None:
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim] +
+                        (total_num_kv_heads, num_query_heads_per_kv_head + 2,
+                         -1) + loaded_weight_shape[output_dim + 1:])
+                    wq = loaded_weight.narrow(
+                        output_dim + 1, 0,
+                        num_query_heads_per_kv_head).reshape(
+                            *loaded_weight_shape[:output_dim], -1,
+                            *loaded_weight_shape[output_dim + 1:])
+                    wk = loaded_weight.narrow(
+                        output_dim + 1, num_query_heads_per_kv_head,
+                        1).reshape(*loaded_weight_shape[:output_dim], -1,
+                                   *loaded_weight_shape[output_dim + 1:])
+                    wv = loaded_weight.narrow(
+                        output_dim + 1, num_query_heads_per_kv_head + 1,
+                        1).reshape(*loaded_weight_shape[:output_dim], -1,
+                                   *loaded_weight_shape[output_dim + 1:])
+                    loaded_weight = torch.cat([wq, wk, wv], dim=output_dim)
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
```
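Falcon's fused `query_key_value` weight interleaves, per KV head, that head's query heads followed by one key and one value head; the `view`/`narrow`/`reshape`/`cat` sequence reorders the output dimension into `[all Q | all K | all V]`. The new `output_dim is not None` guard lets tensors that carry no `output_dim` attribute (as packed GPTQ weights may not) pass through untouched. A small sketch of the same reindexing, with toy sizes that are not Falcon's real configuration:

```python
import torch

# Toy sizes: 2 KV heads, 4 query heads per KV head, head size 3, 5 input
# features; the weight is laid out as (out_features, in_features).
total_num_kv_heads = 2
num_query_heads_per_kv_head = 4
head_size = 3
output_dim = 0

out_features = total_num_kv_heads * (num_query_heads_per_kv_head + 2) * head_size
loaded_weight = torch.arange(out_features * 5,
                             dtype=torch.float32).reshape(out_features, 5)
loaded_weight_shape = loaded_weight.shape

# Expose a (kv_head, q/k/v slot, head_size) structure on the output dim.
w = loaded_weight.view(loaded_weight_shape[:output_dim] +
                       (total_num_kv_heads, num_query_heads_per_kv_head + 2, -1) +
                       loaded_weight_shape[output_dim + 1:])
# Slice out queries, keys, and values, then flatten back to 2-D.
wq = w.narrow(output_dim + 1, 0, num_query_heads_per_kv_head).reshape(
    *loaded_weight_shape[:output_dim], -1, *loaded_weight_shape[output_dim + 1:])
wk = w.narrow(output_dim + 1, num_query_heads_per_kv_head, 1).reshape(
    *loaded_weight_shape[:output_dim], -1, *loaded_weight_shape[output_dim + 1:])
wv = w.narrow(output_dim + 1, num_query_heads_per_kv_head + 1, 1).reshape(
    *loaded_weight_shape[:output_dim], -1, *loaded_weight_shape[output_dim + 1:])
reordered = torch.cat([wq, wk, wv], dim=output_dim)

assert reordered.shape == loaded_weight.shape
assert wq.shape[0] == total_num_kv_heads * num_query_heads_per_kv_head * head_size
```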
vllm/model_executor/models/gpt2.py

```diff
@@ -275,7 +275,6 @@ class GPT2LMHeadModel(nn.Module):
                 if not name.endswith(".weight"):
                     continue
                 loaded_weight = loaded_weight.t()
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
             weight_loader(param, loaded_weight)
```
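For context on the `loaded_weight.t()` kept here: Hugging Face's GPT-2 uses `Conv1D` modules, which store weights as `(in_features, out_features)`, the transpose of `nn.Linear`'s layout; only `.weight` tensors get transposed, which is why the `.endswith(".weight")` check precedes it. A one-line illustration:

```python
import torch

# HF GPT-2's Conv1D keeps weight as (in_features, out_features); nn.Linear
# expects (out_features, in_features). Shapes match GPT-2 small's c_attn.
hf_conv1d_weight = torch.randn(768, 2304)
linear_weight = hf_conv1d_weight.t()
assert linear_weight.shape == (2304, 768)
```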
vllm/model_executor/models/gpt_j.py

```diff
@@ -274,11 +274,18 @@ class GPTJForCausalLM(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
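This two-branch shape, repeated nearly verbatim in the remaining `*ForCausalLM` files of this commit, first rewrites checkpoint names for weights that vLLM fuses into stacked parameters, then applies the GPTQ bias guard to the rewritten name so stray biases for fused shards are skipped too. A condensed, runnable sketch of the control flow (the mapping and tensor names are hypothetical):

```python
# Hypothetical stacked-parameter mapping: (param_name, weight_name, shard_id).
stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]

def resolve(name: str, params_dict: dict):
    """Return (param_key, shard_id), or None if the tensor should be skipped."""
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name not in name:
            continue
        name = name.replace(weight_name, param_name)
        # Skip loading extra bias for GPTQ models.
        if name.endswith(".bias") and name not in params_dict:
            return None
        return name, shard_id
    if name.endswith(".bias") and name not in params_dict:
        return None
    return name, None

params_dict = {"model.layers.0.self_attn.qkv_proj.qweight": None}
assert resolve("model.layers.0.self_attn.q_proj.qweight", params_dict) == \
    ("model.layers.0.self_attn.qkv_proj.qweight", "q")
assert resolve("model.layers.0.self_attn.q_proj.bias", params_dict) is None
```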
vllm/model_executor/models/gpt_neox.py

```diff
@@ -72,7 +72,6 @@ class GPTNeoXAttention(nn.Module):
             config.hidden_size,
             linear_method=linear_method,
         )
         scaling = self.head_size**-0.5
         rotary_dim = int(self.head_size * config.rotary_pct)
         assert rotary_dim % 2 == 0
```
vllm/model_executor/models/internlm.py

```diff
@@ -289,11 +289,18 @@ class InternLMForCausalLM(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
vllm/model_executor/models/llama.py

```diff
@@ -330,11 +330,18 @@ class LlamaForCausalLM(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
vllm/model_executor/models/mistral.py

```diff
@@ -321,11 +321,18 @@ class MistralForCausalLM(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
vllm/model_executor/models/mixtral.py

```diff
@@ -153,7 +153,7 @@ class MixtralMoE(nn.Module):
         self.gate = ReplicatedLinear(config.hidden_size,
                                      self.num_total_experts,
                                      bias=False,
-                                     linear_method=linear_method)
+                                     linear_method=None)

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
```

```diff
@@ -418,11 +418,18 @@ class MixtralForCausalLM(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
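Worth noting: the first hunk is the one place this commit disables quantization rather than enabling it. The MoE routing gate is constructed with `linear_method=None`, so the router remains an unquantized `ReplicatedLinear` even when the rest of the model loads as GPTQ. The commit does not state the rationale; a plausible reading is that the gate is tiny and its top-k expert selection is sensitive to quantization error, and that GPTQ Mixtral checkpoints typically leave the router in full precision.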
vllm/model_executor/models/mpt.py

```diff
@@ -297,6 +297,9 @@ class MPTForCausalLM(nn.Module):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         for name, loaded_weight in hf_model_weights_iterator(
                 model_name_or_path, cache_dir, load_format, revision):
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
```
vllm/model_executor/models/opt.py

```diff
@@ -345,11 +345,18 @@ class OPTForCausalLM(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
vllm/model_executor/models/phi_1_5.py

```diff
@@ -305,6 +305,9 @@ class PhiForCausalLM(nn.Module):
             if "rotary_emb.inv_freq" in name:
                 continue
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
             # pylint: disable=E1136
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader",
                                     default_weight_loader)
```
vllm/model_executor/models/qwen.py

```diff
@@ -82,7 +82,6 @@ class QWenAttention(nn.Module):
         self.num_heads = (self.total_num_heads //
                           tensor_model_parallel_world_size)
         self.head_dim = hidden_size // self.total_num_heads
         self.c_attn = QKVParallelLinear(
             hidden_size,
             self.head_dim,
```

```diff
@@ -279,11 +278,18 @@ class QWenLMHeadModel(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
vllm/model_executor/models/yi.py

```diff
@@ -320,11 +320,18 @@ class YiForCausalLM(nn.Module):
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
-                param = params_dict[name.replace(weight_name, param_name)]
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
```
vllm/model_executor/weight_utils.py

```diff
@@ -287,4 +287,5 @@ def initialize_dummy_weights(
     values between -1e-3 and 1e-3 works well for most models.
     """
     for param in model.state_dict().values():
-        param.data.uniform_(low, high)
+        if torch.is_floating_point(param):
+            param.data.uniform_(low, high)
```
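`initialize_dummy_weights` fills parameters with random values for memory profiling; with GPTQ the state dict now also contains integer tensors (packed `qweight`/`qzeros`), and `uniform_` with float bounds is only defined for floating-point tensors, hence the new guard. A sketch with a toy module (layer names are illustrative, not vLLM's real definitions):

```python
import torch
import torch.nn as nn

# Illustrative stand-in: a module mixing a float parameter with an int32
# tensor shaped like a packed GPTQ qweight.
class ToyQuantLinear(nn.Module):
    def __init__(self):
        super().__init__()
        self.scales = nn.Parameter(torch.empty(4, dtype=torch.float16))
        self.qweight = nn.Parameter(torch.empty(4, dtype=torch.int32),
                                    requires_grad=False)

model, low, high = ToyQuantLinear(), -1e-3, 1e-3
for param in model.state_dict().values():
    # Packed integer weights cannot (and should not) be filled with uniform
    # floats; only floating-point tensors are randomized.
    if torch.is_floating_point(param):
        param.data.uniform_(low, high)
```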