[BugFix] Fix LoRA Fp8 (#33879)

Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>

[BugFix] Fix LoRA Fp8 (#33879)
Signed-off-by: Daniel Serebrenik <daserebrenik@nvidia.com>
5b2a9422 · danisereb · GitHub · c1858b7e · 5b2a9422
Unverified commit 5b2a9422, authored Feb 05, 2026 by danisereb; committed by GitHub on Feb 05, 2026.
Show whitespace changes
Inline Side-by-side

Showing with 14 additions and 8 deletions

vllm/lora/layers/fused_moe.py (+14 -8)

No files found.
--- a/vllm/lora/layers/fused_moe.py
+++ b/vllm/lora/layers/fused_moe.py
@@ -130,6 +130,11 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
        self.base_layer.ensure_moe_quant_config_init()
        quant_config = self.base_layer.quant_method.moe_quant_config

+        if getattr(self.base_layer.quant_method, "supports_internal_mk", False):
+            # Use the existing modular kernel from the quant method
+            m_fused_moe_fn = self.base_layer.quant_method.moe_mk
+        else:
+            # Create a new modular kernel via select_gemm_impl
            prepare_finalize = MoEPrepareAndFinalizeNoEP()
            m_fused_moe_fn = FusedMoEModularKernel(
                prepare_finalize,
@@ -138,6 +143,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA):
                ),
                self.base_layer.shared_experts,
            )
+
        if quant_config.use_mxfp4_w4a16:
            assert isinstance(
                m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts)