Unverified commit 105b8ce4, authored by Jee Jee Li and committed by GitHub
Browse files

[Misc] Reduce LoRA-related static variable (#13166)

parent 2cb8c154
......@@ -329,16 +329,6 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
"gate_up_proj": ["w1", "w3"],
}
# LoRA specific attributes
supported_lora_modules = [
"wqkv",
"wo",
"gate_up_proj",
"w2",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self,
*,
vllm_config: VllmConfig,
......
......@@ -380,10 +380,6 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj", "o_proj", "embed_tokens", "lm_head", "up_proj",
"down_proj", "gate_proj", "out_proj", "in_proj", "x_proj"
]
embedding_modules = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
......
......@@ -452,10 +452,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
"lm_head"
]
embedding_modules = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings"
......
......@@ -522,14 +522,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
"embed_tokens",
"lm_head",
]
embedding_modules = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
......
......@@ -227,21 +227,5 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
],
}
# LoRA specific attributes
supported_lora_modules = [
"kv_a_proj_with_mqa",
"q_a_proj",
"q_b_proj",
"kv_b_proj",
"o_proj",
"gate_up_proj",
"down_proj",
"embed_tokens",
"lm_head",
]
# `embedding_modules` and `embedding_padding_modules`
# are inherited from MiniCPMForCausalLM
def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
return MiniCPM3Model(vllm_config=vllm_config, prefix=prefix)
......@@ -1228,23 +1228,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
"up_proj",
],
}
# LoRA specific attributes
supported_lora_modules = [
# vision encoder
"fc1",
"fc2",
"out_proj",
# language model
"qkv_proj", # same name with vision encoder
"o_proj",
"gate_up_proj",
"down_proj",
# resampler
"kv_proj",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
......@@ -1338,23 +1321,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
"up_proj",
],
}
# LoRA specific attributes
supported_lora_modules = [
# vision encoder
"fc1",
"fc2",
"out_proj",
# language model
"qkv_proj", # same name with vision encoder
"o_proj",
"gate_up_proj",
"down_proj",
# resampler
"kv_proj",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
......@@ -1460,13 +1426,6 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
which is not conducive to the current integration logic of LoRA and
bitsandbytes in vLLM. Therefore, it is necessary to separate them.
"""
# Ensure that the LoRA support check passes when the class is not
# initialized, but set all these attributes to empty.
# These will be updated when an instance class is selected
packed_modules_mapping = {}
supported_lora_modules = []
embedding_modules = {}
embedding_padding_modules = []
def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
......@@ -1487,7 +1446,6 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
# quant_config references base class members,
# so update values before init is called
cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
cls.supported_lora_modules += instance_cls.supported_lora_modules
cls.embedding_modules.update(instance_cls.embedding_modules)
cls.embedding_padding_modules += instance_cls.embedding_padding_modules
return instance_cls(vllm_config=vllm_config, prefix=prefix)
......@@ -332,10 +332,6 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj", "o_proj", "embed_tokens", "lm_head", "w1", "w2", "w3",
"gate"
]
embedding_modules = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
......
......@@ -1440,26 +1440,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
"merged_linear": ["gate_proj", "up_proj"] # image_projector
}
# LoRA specific attributes
supported_lora_modules = [
# language model
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj", # same name with image_projector
# vision tower
"wq",
"wk",
"wv",
"wo",
"w1",
"w2",
# image_projector
"merged_linear",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -389,9 +389,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj", "o_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"
]
embedding_modules = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
......
......@@ -273,17 +273,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
]
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"dense",
"fc1",
"fc2",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -526,16 +526,6 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"embed_tokens",
"lm_head",
"w1",
"w2",
"w3",
"gate",
]
embedding_modules = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
......
......@@ -354,15 +354,6 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
"w1",
],
}
# LoRA specific attributes
supported_lora_modules = [
"c_attn",
"gate_up_proj",
"c_proj",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
......
......@@ -430,16 +430,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
],
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
......@@ -528,16 +518,6 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
],
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
]
embedding_modules = {}
embedding_padding_modules = []
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
......
......@@ -734,27 +734,6 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"up_proj",
],
}
# LoRA specific attributes
supported_lora_modules = [
# language model
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj", # Same name with vision encoder
# vision tower
"qkv",
"gate_proj",
"up_proj",
"attn.proj", # Distinguish patch_embed.proj
"fc1",
"fc2",
# projector
"mlp.0",
"mlp.2"
]
embedding_modules = {}
embedding_padding_modules = []
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
......
......@@ -47,16 +47,6 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
],
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
......
......@@ -1048,24 +1048,6 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
],
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
# vision tower
"qkv",
"attn.proj", # Distinguish patch_embed.proj
"fc1",
"fc2",
# projector
"mlp.0",
"mlp.2"
]
embedding_modules = {}
embedding_padding_modules = []
# To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
"lm_head.": "language_model.lm_head.",
......
......@@ -667,21 +667,6 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
"w1",
],
}
# LoRA specific attributes
supported_lora_modules = [
"c_attn",
"gate_up_proj",
"c_proj",
# visual module
"out_proj",
"in_proj",
"c_fc",
# resampler
"kv_proj",
]
embedding_modules = {}
embedding_padding_modules = []
def get_mm_mapping(self) -> MultiModelKeys:
"""
......
......@@ -386,14 +386,6 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
"embed_tokens",
"lm_head",
]
embedding_modules = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
......
......@@ -27,6 +27,11 @@ from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.distributed.utils import divide
from vllm.logger import init_logger
from vllm.lora.fully_sharded_layers import (
ColumnParallelLinearWithShardedLoRA, RowParallelLinearWithShardedLoRA)
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
ReplicatedLinearWithLoRA,
RowParallelLinearWithLoRA)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
ReplicatedLinear,
RowParallelLinear)
......@@ -103,6 +108,23 @@ def replace_linear_class(
"rowwise": RowParallelLinear,
}.get(style, ReplicatedLinear)
# Lookup table: vLLM linear class -> {fully_sharded flag -> LoRA wrapper class}.
# Indexed first by the linear class, then by whether the fully sharded
# LoRA variant is requested.
lora_linear_cls = {
ColumnParallelLinear: {
True: ColumnParallelLinearWithShardedLoRA, # fully sharded
False: ColumnParallelLinearWithLoRA # not fully sharded
},
RowParallelLinear: {
True: RowParallelLinearWithShardedLoRA,
False: RowParallelLinearWithLoRA
},
# ReplicatedLinear doesn't support fully sharded LoRA yet,
# so we use the same class for both cases.
ReplicatedLinear: {
True: ReplicatedLinearWithLoRA,
False: ReplicatedLinearWithLoRA
}
}
class HFCompatibleLinear(vllm_linear_cls):
"""
Wrapper class that removes `output_bias` from returned output.
......@@ -111,6 +133,19 @@ def replace_linear_class(
def forward(self, input: torch.Tensor) -> torch.Tensor:
# Delegate to the parent linear layer and keep only the first element of
# its result; per the class docstring, this drops `output_bias`.
return super().forward(input)[0]
@classmethod
def get_lora_class(cls, fully_sharded: bool = False):
"""
Get the LoRA class corresponding to the current transformer
linear class.
Args:
fully_sharded (bool): If True, select the LoRA class variant
that supports fully sharded LoRA. Defaults to False.
Returns:
The entry of `lora_linear_cls` for the wrapped `vllm_linear_cls`
and the given `fully_sharded` flag. For `ReplicatedLinear` both
flag values map to the same class.
"""
# `cls` is unused: the lookup key is the closure variable
# `vllm_linear_cls`, i.e. the base class this wrapper was built from.
return lora_linear_cls[vllm_linear_cls][fully_sharded]
return HFCompatibleLinear(
input_size=linear.in_features,
output_size=linear.out_features,
......
......@@ -360,14 +360,6 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
"gate_up_proj": ["gate_proj", "up_proj"]
}
# LoRA specific attributes
# TODO : Add LoRA to the audio tower and projector.
supported_lora_modules = [
"qkv_proj", "o_proj", "gate_up_proj", "down_proj"
]
embedding_modules = {}
embedding_padding_modules = []
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."})
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment