Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
105b8ce4
Unverified
Commit
105b8ce4
authored
Feb 22, 2025
by
Jee Jee Li
Committed by
GitHub
Feb 22, 2025
Browse files
[Misc] Reduce LoRA-related static variable (#13166)
parent
2cb8c154
Changes
41
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
35 additions
and
241 deletions
+35
-241
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2.py
+0
-10
vllm/model_executor/models/jamba.py
vllm/model_executor/models/jamba.py
+0
-4
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+0
-4
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm.py
+0
-8
vllm/model_executor/models/minicpm3.py
vllm/model_executor/models/minicpm3.py
+0
-16
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+0
-42
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral.py
+0
-4
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+0
-20
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron.py
+0
-3
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi.py
+0
-11
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+0
-10
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+0
-9
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+0
-20
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+0
-21
vllm/model_executor/models/qwen2_rm.py
vllm/model_executor/models/qwen2_rm.py
+0
-10
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+0
-18
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+0
-15
vllm/model_executor/models/solar.py
vllm/model_executor/models/solar.py
+0
-8
vllm/model_executor/models/transformers.py
vllm/model_executor/models/transformers.py
+35
-0
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+0
-8
No files found.
vllm/model_executor/models/internlm2.py
View file @
105b8ce4
...
...
@@ -329,16 +329,6 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
"gate_up_proj"
:
[
"w1"
,
"w3"
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"wqkv"
,
"wo"
,
"gate_up_proj"
,
"w2"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
...
...
vllm/model_executor/models/jamba.py
View file @
105b8ce4
...
...
@@ -380,10 +380,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"embed_tokens"
,
"lm_head"
,
"up_proj"
,
"down_proj"
,
"gate_proj"
,
"out_proj"
,
"in_proj"
,
"x_proj"
]
embedding_modules
=
{
"embed_tokens"
:
"input_embeddings"
,
"lm_head"
:
"output_embeddings"
,
...
...
vllm/model_executor/models/llama.py
View file @
105b8ce4
...
...
@@ -452,10 +452,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
"embed_tokens"
,
"lm_head"
]
embedding_modules
=
{
"embed_tokens"
:
"input_embeddings"
,
"lm_head"
:
"output_embeddings"
...
...
vllm/model_executor/models/minicpm.py
View file @
105b8ce4
...
...
@@ -522,14 +522,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
"embed_tokens"
,
"lm_head"
,
]
embedding_modules
=
{
"embed_tokens"
:
"input_embeddings"
,
"lm_head"
:
"output_embeddings"
,
...
...
vllm/model_executor/models/minicpm3.py
View file @
105b8ce4
...
...
@@ -227,21 +227,5 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"kv_a_proj_with_mqa"
,
"q_a_proj"
,
"q_b_proj"
,
"kv_b_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
"embed_tokens"
,
"lm_head"
,
]
# `embedding_modules` and `embedding_padding_modules`
# are inherited from MiniCPMForCausalLM
def
_init_model
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
return
MiniCPM3Model
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
vllm/model_executor/models/minicpmv.py
View file @
105b8ce4
...
...
@@ -1228,23 +1228,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
"up_proj"
,
],
}
# LoRA specific attributes
supported_lora_modules
=
[
# vision encoder
"fc1"
,
"fc2"
,
"out_proj"
,
# language model
"qkv_proj"
,
# same name with vision encoder
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
# resampler
"kv_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
...
...
@@ -1338,23 +1321,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
"up_proj"
,
],
}
# LoRA specific attributes
supported_lora_modules
=
[
# vision encoder
"fc1"
,
"fc2"
,
"out_proj"
,
# language model
"qkv_proj"
,
# same name with vision encoder
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
# resampler
"kv_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
...
...
@@ -1460,13 +1426,6 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
which is not conducive to the current integration logic of LoRA and
bitsandbytes in vLLM. Therefore, it is necessary to separate them.
"""
# Ensure that the LoRA support check passes when the class is not
# initialized, but set all these attributes to empty.
# These will be updated when an instance class is selected
packed_modules_mapping
=
{}
supported_lora_modules
=
[]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__new__
(
cls
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
...
...
@@ -1487,7 +1446,6 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
# quant_config references base class members,
# so update values before init is called
cls
.
packed_modules_mapping
.
update
(
instance_cls
.
packed_modules_mapping
)
cls
.
supported_lora_modules
+=
instance_cls
.
supported_lora_modules
cls
.
embedding_modules
.
update
(
instance_cls
.
embedding_modules
)
cls
.
embedding_padding_modules
+=
instance_cls
.
embedding_padding_modules
return
instance_cls
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
vllm/model_executor/models/mixtral.py
View file @
105b8ce4
...
...
@@ -332,10 +332,6 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"embed_tokens"
,
"lm_head"
,
"w1"
,
"w2"
,
"w3"
,
"gate"
]
embedding_modules
=
{
"embed_tokens"
:
"input_embeddings"
,
"lm_head"
:
"output_embeddings"
,
...
...
vllm/model_executor/models/molmo.py
View file @
105b8ce4
...
...
@@ -1440,26 +1440,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
"merged_linear"
:
[
"gate_proj"
,
"up_proj"
]
# image_projector
}
# LoRA specific attributes
supported_lora_modules
=
[
# language model
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
# same name with image_projector
# vision tower
"wq"
,
"wk"
,
"wv"
,
"wo"
,
"w1"
,
"w2"
,
# image_projector
"merged_linear"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
...
...
vllm/model_executor/models/nemotron.py
View file @
105b8ce4
...
...
@@ -389,9 +389,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"up_proj"
,
"down_proj"
,
"embed_tokens"
,
"lm_head"
]
embedding_modules
=
{
"embed_tokens"
:
"input_embeddings"
,
"lm_head"
:
"output_embeddings"
,
...
...
vllm/model_executor/models/phi.py
View file @
105b8ce4
...
...
@@ -273,17 +273,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
]
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"dense"
,
"fc1"
,
"fc2"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
...
...
vllm/model_executor/models/phimoe.py
View file @
105b8ce4
...
...
@@ -526,16 +526,6 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"embed_tokens"
,
"lm_head"
,
"w1"
,
"w2"
,
"w3"
,
"gate"
,
]
embedding_modules
=
{
"embed_tokens"
:
"input_embeddings"
,
"lm_head"
:
"output_embeddings"
,
...
...
vllm/model_executor/models/qwen.py
View file @
105b8ce4
...
...
@@ -354,15 +354,6 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
"w1"
,
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"c_attn"
,
"gate_up_proj"
,
"c_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
config
=
vllm_config
.
model_config
.
hf_config
...
...
vllm/model_executor/models/qwen2.py
View file @
105b8ce4
...
...
@@ -430,16 +430,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
...
...
@@ -528,16 +518,6 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
105b8ce4
...
...
@@ -734,27 +734,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"up_proj"
,
],
}
# LoRA specific attributes
supported_lora_modules
=
[
# language model
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
# Same name with vision encoder
# vision tower
"qkv"
,
"gate_proj"
,
"up_proj"
,
"attn.proj"
,
# Distinguish patch_embed.proj
"fc1"
,
"fc2"
,
# projector
"mlp.0"
,
"mlp.2"
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
...
...
vllm/model_executor/models/qwen2_rm.py
View file @
105b8ce4
...
...
@@ -47,16 +47,6 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
105b8ce4
...
...
@@ -1048,24 +1048,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
# vision tower
"qkv"
,
"attn.proj"
,
# Distinguish patch_embed.proj
"fc1"
,
"fc2"
,
# projector
"mlp.0"
,
"mlp.2"
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"lm_head."
:
"language_model.lm_head."
,
...
...
vllm/model_executor/models/qwen_vl.py
View file @
105b8ce4
...
...
@@ -667,21 +667,6 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
"w1"
,
],
}
# LoRA specific attributes
supported_lora_modules
=
[
"c_attn"
,
"gate_up_proj"
,
"c_proj"
,
# visual module
"out_proj"
,
"in_proj"
,
"c_fc"
,
# resampler
"kv_proj"
,
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
def
get_mm_mapping
(
self
)
->
MultiModelKeys
:
"""
...
...
vllm/model_executor/models/solar.py
View file @
105b8ce4
...
...
@@ -386,14 +386,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
,
"embed_tokens"
,
"lm_head"
,
]
embedding_modules
=
{
"embed_tokens"
:
"input_embeddings"
,
"lm_head"
:
"output_embeddings"
,
...
...
vllm/model_executor/models/transformers.py
View file @
105b8ce4
...
...
@@ -27,6 +27,11 @@ from vllm.config import VllmConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed.utils
import
divide
from
vllm.logger
import
init_logger
from
vllm.lora.fully_sharded_layers
import
(
ColumnParallelLinearWithShardedLoRA
,
RowParallelLinearWithShardedLoRA
)
from
vllm.lora.layers
import
(
ColumnParallelLinearWithLoRA
,
ReplicatedLinearWithLoRA
,
RowParallelLinearWithLoRA
)
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ReplicatedLinear
,
RowParallelLinear
)
...
...
@@ -103,6 +108,23 @@ def replace_linear_class(
"rowwise"
:
RowParallelLinear
,
}.
get
(
style
,
ReplicatedLinear
)
lora_linear_cls
=
{
ColumnParallelLinear
:
{
True
:
ColumnParallelLinearWithShardedLoRA
,
# fully sharded
False
:
ColumnParallelLinearWithLoRA
# not fully sharded
},
RowParallelLinear
:
{
True
:
RowParallelLinearWithShardedLoRA
,
False
:
RowParallelLinearWithLoRA
},
# ReplicatedLinear doesn't support fully sharded LoRA yet,
# so we use the same class for both cases.
ReplicatedLinear
:
{
True
:
ReplicatedLinearWithLoRA
,
False
:
ReplicatedLinearWithLoRA
}
}
class
HFCompatibleLinear
(
vllm_linear_cls
):
"""
Wrapper class that removes `output_bias` from returned output.
...
...
@@ -111,6 +133,19 @@ def replace_linear_class(
def
forward
(
self
,
input
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
super
().
forward
(
input
)[
0
]
@
classmethod
def
get_lora_class
(
cls
,
fully_sharded
:
bool
=
False
):
"""
Get the LoRA class corresponding to the current transformer
linear class.
Args:
fully_sharded (bool): If True, select the LoRA class variant
that supports fully sharded LoRA. Defaults to False.
"""
return
lora_linear_cls
[
vllm_linear_cls
][
fully_sharded
]
return
HFCompatibleLinear
(
input_size
=
linear
.
in_features
,
output_size
=
linear
.
out_features
,
...
...
vllm/model_executor/models/ultravox.py
View file @
105b8ce4
...
...
@@ -360,14 +360,6 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
"gate_up_proj"
:
[
"gate_proj"
,
"up_proj"
]
}
# LoRA specific attributes
# TODO : Add LoRA to the audio tower and projector.
supported_lora_modules
=
[
"qkv_proj"
,
"o_proj"
,
"gate_up_proj"
,
"down_proj"
]
embedding_modules
=
{}
embedding_padding_modules
=
[]
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"audio_tower.model.encoder."
:
"audio_tower."
})
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment