[Bugfix] Fix getting device for MoE LoRA (#29475)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

[Bugfix] Fix getting device for MoE LoRA (#29475)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
c069086b · Jee Jee Li · GitHub · 11ea5ec1 · c069086b · c069086b
Unverified commit c069086b, authored Nov 27, 2025 by Jee Jee Li; committed by GitHub Nov 26, 2025
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 12 additions and 1 deletion

vllm/lora/layers/fused_moe.py +3 -1

vllm/lora/layers/utils.py +9 -0

No files found.
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -30,6 +30,8 @@ from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
    FusedMoEModularMethod,
 )

+from .utils import _get_lora_device
+

 class FusedMoEWithLoRA(BaseLayerWithLoRA):
    def __init__(self, base_layer: FusedMoE) -> None:
@@ -41,7 +43,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
        )
        self.tp_size = get_tensor_model_parallel_world_size()
        self.tp_rank = get_tensor_model_parallel_rank()
-        self.device = base_layer.w2_weight.device
+        self.device = _get_lora_device(base_layer)
        self._w13_slices = 2
        self._inject_lora_into_fused_moe()


--- a/vllm/lora/layers/utils.py
+++ b/vllm/lora/layers/utils.py
@@ -33,6 +33,15 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device:
    # HQQ marlin
    elif hasattr(base_layer, "W_q"):
        return base_layer.W_q.device
+    # MoE layer
+    elif hasattr(base_layer, "w2_weight"):
+        return base_layer.w2_weight.device
+    # MoE Compressed Tensor
+    elif hasattr(base_layer, "w2_weight_packed"):
+        return base_layer.w2_weight_packed.device
+    # MoE GPTQ/AWQ/GGUF
+    elif hasattr(base_layer, "w2_qweight"):
+        return base_layer.w2_qweight.device
    else:
        raise ValueError(f"Unsupported base layer: {base_layer}")