[LoRA] Cleanup LoRA unused code (#29611)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

[LoRA] Cleanup LoRA unused code (#29611)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
39e63dec · Jee Jee Li · GitHub · 4a80ad0a · 39e63dec · 39e63dec
Unverified Commit 39e63dec authored Nov 29, 2025 by Jee Jee Li Committed by GitHub Nov 28, 2025
20 changed files
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -34,12 +34,10 @@ class WorkerLoRAManager:
        vllm_config: VllmConfig,
        device: torch.device,
        embedding_modules: dict[str, str],
-        embedding_padding_modules: list[str],
        lora_model_cls: type[LoRAModel] = LoRAModel,
    ):
        self._lora_model_cls = lora_model_cls
        self.embedding_modules = embedding_modules
-        self.embedding_padding_modules = embedding_padding_modules
        self._cached_dummy_lora: None | Literal[False] | LoRAModel = False
        self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
        self.max_num_batched_tokens = (
@@ -121,9 +119,7 @@ class WorkerLoRAManager:
                lora_model_id=lora_request.lora_int_id,
                device="cpu",
                dtype=self.lora_config.lora_dtype,
-                target_embedding_padding=self.vocab_size,
+                model_vocab_size=self.vocab_size,
-                embedding_modules=self.embedding_modules,
-                embedding_padding_modules=self.embedding_padding_modules,
                tensorizer_config_dict=lora_request.tensorizer_config_dict,
                weights_mapper=hf_to_vllm_mapper,
            )

--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -482,7 +482,6 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(
        self,

--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -419,7 +419,6 @@ class BambaForCausalLM(
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    @classmethod
    def get_mamba_state_dtype_from_config(

--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -457,7 +457,6 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        "wte": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -450,7 +450,6 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -510,7 +510,6 @@ class FalconH1ForCausalLM(
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    @classmethod
    def get_mamba_state_dtype_from_config(

--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -400,7 +400,6 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -497,7 +497,6 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -601,7 +601,6 @@ class GraniteMoeHybridForCausalLM(
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    @classmethod
    def get_mamba_state_dtype_from_config(

--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -263,7 +263,6 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -347,7 +347,6 @@ class SupportsLoRA(Protocol):
    # The `embedding_module` and `embedding_padding_modules`
    # are empty by default.
    embedding_modules: ClassVar[dict[str, str]] = {}
-    embedding_padding_modules: ClassVar[list[str]] = []
    packed_modules_mapping: dict[str, list[str]] = {}
@@ -359,7 +358,6 @@ class _SupportsLoRAType(Protocol):
    packed_modules_mapping: dict[str, list[str]]
    embedding_modules: dict[str, str]
-    embedding_padding_modules: list[str]
 @overload
@@ -379,7 +377,6 @@ def supports_lora(
        lora_attrs = (
            "packed_modules_mapping",
            "embedding_modules",
-            "embedding_padding_modules",
        )
        missing_attrs = tuple(attr for attr in lora_attrs if not hasattr(model, attr))

--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -480,7 +480,6 @@ class JambaForCausalLM(
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        config = vllm_config.model_config.hf_config

--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -422,7 +422,6 @@ class Lfm2ForCausalLM(
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    @classmethod
    def get_mamba_state_dtype_from_config(

--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -602,7 +602,6 @@ class Lfm2MoeForCausalLM(
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    @classmethod
    def get_mamba_state_dtype_from_config(

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -528,7 +528,6 @@ class LlamaForCausalLM(
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    # Mistral/Llama models can also be loaded with --load-format mistral
    # from consolidated.safetensors checkpoints

--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -568,7 +568,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -305,7 +305,6 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -1741,5 +1741,4 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA):
        # so update values before init is called
        cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
        cls.embedding_modules.update(instance_cls.embedding_modules)
-        cls.embedding_padding_modules += instance_cls.embedding_padding_modules
        return instance_cls(vllm_config=vllm_config, prefix=prefix)
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -496,7 +496,6 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -439,7 +439,6 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
-    embedding_padding_modules = ["lm_head"]
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()