Merge branch 'v0.9.2-dev-tc_opt' into 'v0.9.2-dev'

feat(moe): 新增 VLLM_USE_MOE_W16A16_TRITON 强制 Triton MoE

See merge request dcutoolkit/deeplearing/vllm!396

Merge branch 'v0.9.2-dev-tc_opt' into 'v0.9.2-dev'
feat(moe): 新增 VLLM_USE_MOE_W16A16_TRITON 强制 Triton MoE
See merge request dcutoolkit/deeplearing/vllm!396
f35ea024 · zhuwenwen · 19d458ec · cedfe391 · f35ea024 · f35ea024
Commit f35ea024 authored Jan 28, 2026 by zhuwenwen
3 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -216,6 +216,7 @@ if TYPE_CHECKING:
    VLLM_MOE_ROUTER_CAPTURE_MAX_LAYERS: int = 0
    VLLM_MOE_ROUTER_CAPTURE_NUM_TOKENS_GT: int = -1
    VLLM_MOE_ROUTER_CAPTURE_NUM_TOKENS_LT: int = -1
+    VLLM_USE_MOE_W16A16_TRITON: bool = False
 def get_default_cache_root():
    return os.getenv(
@@ -1383,6 +1384,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Only capture when num_tokens < N (0 disables).
    "VLLM_MOE_ROUTER_CAPTURE_NUM_TOKENS_LT":
    lambda: int(os.environ.get("VLLM_MOE_ROUTER_CAPTURE_NUM_TOKENS_LT", "-1")),
+    # Force using Triton MoE path (disable Marlin W16A16 MoE).
+    "VLLM_USE_MOE_W16A16_TRITON":
+        lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in
+                 ("true", "1")),
 }
 # --8<-- [end:env-vars-definition]

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1711,6 +1711,11 @@ def fused_experts_impl(
                     or getattr(w2, "marlin_w16a16_packed", False)
                     or _is_marlin_w16a16_packed(w1, w2))
        if is_packed:
+            if envs.VLLM_USE_MOE_W16A16_TRITON:
+                raise RuntimeError(
+                    "VLLM_USE_MOE_W16A16_TRITON=1 forces Triton MoE, but the MoE weights are "
+                    "packed in Marlin W16A16 layout. Please load unpacked weights or set "
+                    "VLLM_USE_MOE_W16A16_TRITON=0.")
            try:
                from vllm.model_executor.layers.fused_moe.fuse_moe_w16a16_marlin import (  # noqa: E501
                    fused_experts_impl_w16a16_marlin)

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -101,9 +101,6 @@ def _is_marlin_w16a16_moe_supported(
        return False
    if E <= 0 or N <= 0 or K <= 0 or top_k <= 0:
        return False
-    if not envs.VLLM_USE_LIGHTOP:
-        return False
    try:
        from lightop import get_moe_cuda_marlin_config_w16a16
@@ -1051,7 +1048,9 @@ class FusedMoE(torch.nn.Module):
            # Not considering quant for now, temporarily
            moe_in_dtype = model_dtype
            self._marlin_w16a16_moe_enabled = (
-                params_dtype == moe_in_dtype and self.activation == "silu"
+                not envs.VLLM_USE_MOE_W16A16_TRITON
+                and params_dtype == moe_in_dtype
+                and self.activation == "silu"
                and not self.apply_router_weight_on_input
                and _is_marlin_w16a16_moe_supported(
                    E=self.local_num_experts,