Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
112588c2
Commit
112588c2
authored
Oct 28, 2024
by
zhuwenwen
Browse files
update lm_head of llama
parent
fa5b0b39
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
8 deletions
+12
-8
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/vocab_parallel_embedding.py
+2
-2
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+4
-4
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+6
-2
No files found.
vllm/model_executor/layers/vocab_parallel_embedding.py
View file @
112588c2
...
@@ -22,7 +22,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
...
@@ -22,7 +22,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
def
__init__
(
self
):
def
__init__
(
self
):
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_lm_
t
n
=
os
.
environ
.
get
(
'LM_
T
N'
)
==
'1'
self
.
use_lm_
n
n
=
os
.
environ
.
get
(
'LM_
N
N'
)
==
'1'
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
input_size_per_partition
:
int
,
input_size_per_partition
:
int
,
...
@@ -42,7 +42,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
...
@@ -42,7 +42,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
layer
:
torch
.
nn
.
Module
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
if
self
.
use_llama_nn
and
not
self
.
use_lm_
t
n
:
if
self
.
use_llama_nn
and
self
.
use_lm_
n
n
:
if
bias
is
not
None
:
if
bias
is
not
None
:
if
len
(
x
.
shape
)
==
2
:
if
len
(
x
.
shape
)
==
2
:
return
torch
.
addmm
(
bias
,
x
,
layer
.
weight
)
return
torch
.
addmm
(
bias
,
x
,
layer
.
weight
)
...
...
vllm/model_executor/model_loader/utils.py
View file @
112588c2
...
@@ -30,10 +30,10 @@ def get_model_architecture(
...
@@ -30,10 +30,10 @@ def get_model_architecture(
os
.
environ
[
'LLAMA_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
else
:
else
:
os
.
environ
[
'LLAMA_NN'
]
=
'1'
os
.
environ
[
'LLAMA_NN'
]
=
'1'
if
architectures
==
[
'BloomForCausalLM'
]
or
architectures
==
[
'LlamaForCausalLM'
]
:
if
architectures
==
[
'BloomForCausalLM'
]:
os
.
environ
[
'LM_
T
N'
]
=
'
1
'
os
.
environ
[
'LM_
N
N'
]
=
'
0
'
else
:
else
:
os
.
environ
[
'LM_
T
N'
]
=
'
0
'
os
.
environ
[
'LM_
N
N'
]
=
'
1
'
if
os
.
getenv
(
'GEMM_PAD'
)
!=
'1'
:
if
os
.
getenv
(
'GEMM_PAD'
)
!=
'1'
:
os
.
environ
[
'GEMM_PAD'
]
=
'0'
os
.
environ
[
'GEMM_PAD'
]
=
'0'
if
os
.
getenv
(
'FA_PAD'
)
!=
'1'
:
if
os
.
getenv
(
'FA_PAD'
)
!=
'1'
:
...
@@ -50,7 +50,7 @@ def get_model_architecture(
...
@@ -50,7 +50,7 @@ def get_model_architecture(
os
.
environ
[
'AWQ_PAD'
]
=
'0'
os
.
environ
[
'AWQ_PAD'
]
=
'0'
else
:
else
:
os
.
environ
[
'LLAMA_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
os
.
environ
[
'LM_
T
N'
]
=
'
1
'
os
.
environ
[
'LM_
N
N'
]
=
'
0
'
os
.
environ
[
'GEMM_PAD'
]
=
'0'
os
.
environ
[
'GEMM_PAD'
]
=
'0'
os
.
environ
[
'FA_PAD'
]
=
'0'
os
.
environ
[
'FA_PAD'
]
=
'0'
os
.
environ
[
'AWQ_PAD'
]
=
'0'
os
.
environ
[
'AWQ_PAD'
]
=
'0'
...
...
vllm/model_executor/models/llama.py
View file @
112588c2
...
@@ -455,6 +455,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
...
@@ -455,6 +455,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
self
.
quant_config
=
quant_config
self
.
quant_config
=
quant_config
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_llama_nn
=
os
.
environ
.
get
(
'LLAMA_NN'
)
==
'1'
self
.
use_lm_nn
=
os
.
environ
.
get
(
'LM_NN'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_gemm_pad
=
os
.
environ
.
get
(
'GEMM_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_fa_pad
=
os
.
environ
.
get
(
'FA_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
self
.
use_awq_pad
=
os
.
environ
.
get
(
'AWQ_PAD'
)
==
'1'
...
@@ -573,9 +574,12 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
...
@@ -573,9 +574,12 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
"self_attn.qkv_proj.weight"
,
"self_attn.qkv_proj.weight"
,
"self_attn.o_proj.weight"
,
"self_attn.o_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.gate_up_proj.weight"
,
"mlp.down_proj.weight"
,
"mlp.down_proj.weight"
# "lm_head.weight"
]
]
if
self
.
use_lm_nn
:
lay_key_words
.
append
(
"lm_head.weight"
)
combined_words
=
"|"
.
join
(
lay_key_words
)
combined_words
=
"|"
.
join
(
lay_key_words
)
lay_qkv_words
=
[
"self_attn.qkv_proj.weight"
]
lay_qkv_words
=
[
"self_attn.qkv_proj.weight"
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment