Set defaults: MOE_NN=0, VLLM_USE_FUSED_RMS_ROPE=0, VLLM_USE_FUSE_SILU_AND_MUL=0 (no longer auto-enabled in the model loader) and VLLM_W8A8_BACKEND=1

3eccb64e · zhuwenwen · 39562a7f · 3eccb64e · 3eccb64e · 3eccb64e
Commit 3eccb64e authored Jan 30, 2026 by zhuwenwen
3 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1806,14 +1806,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
                ("true", "1")),
    # vLLM will use fused RMS + RoPE kernel
    "VLLM_USE_FUSED_RMS_ROPE":
-        lambda: (os.environ.get("VLLM_USE_FUSED_RMS_ROPE", "True").lower() in
+        lambda: (os.environ.get("VLLM_USE_FUSED_RMS_ROPE", "False").lower() in
                 ("true", "1")),
    # W8A8 GEMM backend selection for vLLM quantized models.
-    # lightop/triton: 1
+    # lightop/triton: 1 (default)
     # cutlass: 2 (will remove in the future)
-    # blaslt: 3 (default)
+    # blaslt: 3
    # rocblas: others
-    "VLLM_W8A8_BACKEND": lambda: int(os.getenv("VLLM_W8A8_BACKEND", "3")),
+    "VLLM_W8A8_BACKEND": lambda: int(os.getenv("VLLM_W8A8_BACKEND", "1")),
 }
 # --8<-- [end:env-vars-definition]

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -664,7 +664,7 @@ class FusedMoE(CustomOp):
        if quant_config is None:
            # Not considering quant for now, temporarily
-            self.use_nn_moe = int(os.environ.get('MOE_NN', 1)) == 1
+            self.use_nn_moe = int(os.environ.get('MOE_NN', 0)) == 1
        else:
            self.use_nn_moe = False

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -209,8 +209,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                        os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'    
-                    if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
+                    # if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
-                        os.environ['VLLM_USE_FUSE_SILU_AND_MUL'] = '1'
+                    #     os.environ['VLLM_USE_FUSE_SILU_AND_MUL'] = '1'
                    if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'
@@ -241,8 +241,8 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                        os.environ['VLLM_USE_LIGHTOP_MOE_ALIGN'] = '1'
                    if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM"):
                        os.environ['VLLM_USE_LIGHTOP_MOE_SUM'] = '1'    
-                    if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
+                    # if not envs.is_set("VLLM_USE_FUSE_SILU_AND_MUL"):
-                        os.environ['VLLM_USE_FUSE_SILU_AND_MUL'] = '1'
+                    #     os.environ['VLLM_USE_FUSE_SILU_AND_MUL'] = '1'
                    if not envs.is_set("VLLM_USE_OPT_RESHAPE_AND_CACHE"):
                        os.environ['VLLM_USE_OPT_RESHAPE_AND_CACHE'] = '1'