• feat(moe/marlin): 移除 VLLM_USE_MARLIN_W16A16_MOE，改为基于 lightop 探测自动启用并一次性缓存决策

- 使用 get_moe_cuda_marlin_config_w16a16 返回的 status 判断 W16A16 Marlin MoE 是否可用
- 在 FusedMoE 初始化阶段计算并缓存 _marlin_w16a16_moe_enabled，满足条件时强制 use_nn_moe=False
- 权重加载后按缓存结果进行一次性 Marlin pack；运行时按 packed 标记走 Marlin fast path
- 删除 envs.py 中 VLLM_USE_MARLIN_W16A16_MOE 环境变量定义与解析逻辑

• feat(moe/marlin): 移除 VLLM_USE_MARLIN_W16A16_MOE，改为基于 lightop 探测自动启用并一次性缓存决策
- 使用 get_moe_cuda_marlin_config_w16a16 返回的 status 判断 W16A16 Marlin MoE 是否可用
- 在 FusedMoE 初始化阶段计算并缓存 _marlin_w16a16_moe_enabled，满足条件时强制 use_nn_moe=False
- 权重加载后按缓存结果进行一次性 Marlin pack；运行时按 packed 标记走 Marlin fast path
- 删除 envs.py 中 VLLM_USE_MARLIN_W16A16_MOE 环境变量定义与解析逻辑
714a7573 · laibao · 6fa116fb · 714a7573 · 714a7573 · 714a7573
Commit 714a7573 authored Jan 20, 2026 by laibao
3 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -244,7 +244,6 @@ if TYPE_CHECKING:
    VLLM_USE_OPT_RESHAPE_AND_CACHE: bool = False
    VLLM_USE_TOPK_RENORM: bool = False
    VLLM_USE_FUSED_RMS_ROPE: bool = False
-    VLLM_USE_MARLIN_W16A16_MOE:bool = False
    VLLM_V1_FAST_TOKEN_ID_COPY: bool = False
    VLLM_V1_USE_REDUCED_TOPK_TOPP_SAMPLER: bool = False
    VLLM_W8A8_BACKEND: int = 3
@@ -1689,10 +1688,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_FUSED_RMS_ROPE":
        lambda: (os.environ.get("VLLM_USE_FUSED_RMS_ROPE", "True").lower() in
                 ("true", "1")),
-    # vLLM will use Marlin W16A16 kernel for MoE experts
-    "VLLM_USE_MARLIN_W16A16_MOE":
-        lambda: (os.environ.get("VLLM_USE_MARLIN_W16A16_MOE", "False").lower() in
-                 ("true", "1")),
    # vLLM will use fast token id copy
    "VLLM_V1_FAST_TOKEN_ID_COPY":
        lambda: (os.environ.get("VLLM_V1_FAST_TOKEN_ID_COPY", "False").lower() in

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1954,17 +1954,9 @@ def fused_experts_impl(
    CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
    M = min(num_tokens, CHUNK_SIZE)

-    # Optional fast path: use Marlin W16A16 fused MoE implementation when
-    # explicitly requested. When weights are pre-packed in the post-load hook,
-    # w1/w2 are already in Marlin layout and we can avoid first-run packing
-    # peaks during KV cache profiling.
-    if envs.VLLM_USE_MARLIN_W16A16_MOE and not use_nn_moe:
-        try:
-            from vllm.model_executor.layers.fused_moe.fuse_moe_w16a16_marlin import (  # noqa: E501
-                fused_experts_impl_w16a16_marlin)
-        except Exception:
-            fused_experts_impl_w16a16_marlin = None  # type: ignore
-
+    # Optional fast path: use Marlin W16A16 fused MoE implementation when the
+    # expert weights are already packed in Marlin layout.
+    if not use_nn_moe:
        K = hidden_states.size(1)

        def _is_marlin_w16a16_packed(w1: torch.Tensor,
@@ -1992,12 +1984,29 @@ def fused_experts_impl(
                     or getattr(w2, "marlin_w16a16_packed", False)
                     or _is_marlin_w16a16_packed(w1, w2))
        if is_packed:
+            try:
+                from vllm.model_executor.layers.fused_moe.fuse_moe_w16a16_marlin import (  # noqa: E501
+                    fused_experts_impl_w16a16_marlin)
+            except Exception:
+                fused_experts_impl_w16a16_marlin = None  # type: ignore
+
            if fused_experts_impl_w16a16_marlin is None:
                raise RuntimeError(
                    "Marlin W16A16 MoE weights are packed, but the Marlin kernel is unavailable. "
                    "Ensure lightop/lmslim is installed and LMSLIM_USE_LIGHTOP=1."
                )

+            if activation != "silu":
+                raise RuntimeError(
+                    "Marlin W16A16 MoE only supports activation='silu'.")
+            if apply_router_weight_on_input:
+                raise RuntimeError(
+                    "Marlin W16A16 MoE does not support apply_router_weight_on_input=True."
+                )
+            if w1_bias is not None or w2_bias is not None:
+                raise RuntimeError(
+                    "Marlin W16A16 MoE does not support expert biases.")
+
            E = w1.size(0)
            if global_num_experts == -1:
                global_num_experts = E

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import functools
 import os
 import importlib

@@ -93,6 +94,63 @@ else:

 logger = init_logger(__name__)

+_MARLIN_W16A16_MOE_PROBE_BATCH_SIZES: tuple[int, ...] = (1, 128)
+
+
+@functools.lru_cache
+def _is_marlin_w16a16_moe_supported(
+    E: int,
+    N: int,
+    K: int,
+    top_k: int,
+    dtype: torch.dtype,
+) -> bool:
+    """Return True if lightop reports Marlin W16A16 MoE is supported.
+
+    This is a best-effort probe used to decide whether we can safely pre-pack
+    weights into Marlin layout (which would otherwise prevent fallback).
+    """
+    if not (current_platform.is_cuda_alike() and torch.cuda.is_available()):
+        return False
+    if dtype not in (torch.float16, torch.bfloat16):
+        return False
+    if K % 32 != 0 or N % 16 != 0:
+        return False
+    if E <= 0 or N <= 0 or K <= 0 or top_k <= 0:
+        return False
+
+    try:
+        import lmslim.envs as lsenvs
+        if not lsenvs.LMSLIM_USE_LIGHTOP:
+            return False
+
+        from lightop import get_moe_cuda_marlin_config_w16a16
+
+        device_name = lsenvs.LMSLIM_GPU_NAME
+        if not device_name:
+            return False
+        num_cus = torch.cuda.get_device_properties(
+            torch.cuda.current_device()).multi_processor_count
+        twoN = 2 * N
+        for bs in _MARLIN_W16A16_MOE_PROBE_BATCH_SIZES:
+            _, _, status = get_moe_cuda_marlin_config_w16a16(
+                E,
+                bs,
+                twoN,
+                K,
+                K,
+                N,
+                top_k,
+                device_name,
+                num_cus,
+                dtype,
+            )
+            if not status:
+                return False
+        return True
+    except Exception:
+        return False
+

 class FusedMoeWeightScaleSupported(Enum):
    TENSOR = "tensor"
@@ -441,12 +499,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        super().process_weights_after_loading(layer)

-        # If Marlin W16A16 MoE is enabled, pre-pack weights once during the
+        # If Marlin W16A16 MoE is supported, pre-pack weights once during the
        # post-load hook and replace parameters with the packed layout.
        #
        # This avoids first-run packing peaks during KV cache profiling and
        # keeps only one copy of weights resident on GPU in steady state.
-        if (envs.VLLM_USE_MARLIN_W16A16_MOE and current_platform.is_cuda_alike()
+        if (getattr(layer, "_marlin_w16a16_moe_enabled", False)
+                and current_platform.is_cuda_alike()
                and not getattr(layer, "use_nn_moe", False)
                and not getattr(layer, "_marlin_w16a16_moe_packed", False)):
            w1 = layer.w13_weight
@@ -455,12 +514,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                    and w1.dtype in (torch.float16, torch.bfloat16)
                    and w2.dtype in (torch.float16, torch.bfloat16)):
                try:
-                    from vllm.model_executor.layers.fused_moe.fuse_moe_w16a16_marlin import (  # noqa: E501
-                        use_lightop as _use_lightop)
-                    if not _use_lightop:
-                        raise RuntimeError(
-                            "Marlin W16A16 MoE kernel is disabled")
-
                    if w1.dim() != 3 or w2.dim() != 3 or w1.size(0) != w2.size(
                            0):
                        raise RuntimeError("Unexpected MoE weight shapes")
@@ -1252,9 +1305,25 @@ class FusedMoE(CustomOp):
                
        if quant_config is None:
            # Not considering quant for now, temporarily
-            self.use_nn_moe = int(os.environ.get('MOE_NN', 1)) == 1
+            self._marlin_w16a16_moe_enabled = (
+                params_dtype == moe_in_dtype and not self.moe_config.has_bias
+                and self.activation == "silu"
+                and not self.apply_router_weight_on_input
+                and _is_marlin_w16a16_moe_supported(
+                    E=self.local_num_experts,
+                    N=self.intermediate_size_per_partition,
+                    K=self.hidden_size,
+                    top_k=self.top_k,
+                    dtype=moe_in_dtype,
+                ))
+
+            self.use_nn_moe = int(os.environ.get("MOE_NN", 1)) == 1
+            # Marlin W16A16 MoE requires the non-NN weight layout.
+            if self._marlin_w16a16_moe_enabled:
+                self.use_nn_moe = False
        else:
            self.use_nn_moe = False
+            self._marlin_w16a16_moe_enabled = False

        moe_quant_params = {
            "num_experts": self.local_num_experts,