Remove the USE_FUSED_RMS_QUANT and USE_FUSED_SILU_MUL_QUANT environment flags and the fused-quant code paths they gated

60b37c6b · zhuwenwen · c964b9ad
Commit 60b37c6b authored Jan 07, 2026 by zhuwenwen
5 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -270,8 +270,6 @@ if TYPE_CHECKING:
    VLLM_USE_LIGHTOP_MOE_SUM: bool = False
    VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
-    USE_FUSED_RMS_QUANT: bool = False
-    USE_FUSED_SILU_MUL_QUANT: bool = False
    VLLM_USE_PD_SPLIT: bool = False
    VLLM_USE_PP_SYNC: bool = False
    VLLM_USE_PIECEWISE: bool = False
@@ -1726,14 +1724,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_MERGE_ATTN_STATES_OPT":
        lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in
                 ("true", "1")),  
-    # vllm will use rmsquant fused op 
-    "USE_FUSED_RMS_QUANT": 
-    lambda: (os.getenv('USE_FUSED_RMS_QUANT', '0').lower() in
-             ("true", "1")),
-    # vllm will use silu_mul_quant fused op 
-    "USE_FUSED_SILU_MUL_QUANT": 
-    lambda: (os.getenv('USE_FUSED_SILU_MUL_QUANT', '0').lower() in
-             ("true", "1")),
    # vLLM will split prefill and decode, not mix up
    "VLLM_USE_PD_SPLIT":
        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "False").lower() in

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -1592,7 +1592,6 @@ class RowParallelLinear(LinearBase):
    def forward(
        self,
        input_,
-        use_fused_silu_mul_quant: bool | None = False,
    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        if self.input_is_parallel:
            input_parallel = input_
@@ -1607,16 +1606,7 @@ class RowParallelLinear(LinearBase):
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
-        if use_fused_silu_mul_quant:
-            xq, xs = lm_fuse_silu_mul_quant(input_parallel)
-            
-            silu_quant_args = [xq, xs]
-            output_parallel = self.quant_method.apply(self,
-                                                      input_parallel,
-                                                      bias_,
-                                                      silu_quant_args=silu_quant_args)
-        else:
-            output_parallel = self.quant_method.apply(self, input_parallel, bias_)
+        output_parallel = self.quant_method.apply(self, input_parallel, bias_)

        if self.reduce_results and self.tp_size > 1:
            output = tensor_model_parallel_all_reduce(output_parallel)

--- a/vllm/model_executor/layers/quantization/slimquant_w4a8.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8.py
@@ -159,14 +159,7 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
        input_quant_args: Optional[list[torch.Tensor]] = None,
        silu_quant_args: Optional[list[torch.Tensor]] = None 
    ):
-        if envs.USE_FUSED_RMS_QUANT and input_quant_args is not None:
-            assert len(input_quant_args) == 2
-            x_q, x_scale = input_quant_args
-        elif envs.USE_FUSED_SILU_MUL_QUANT and silu_quant_args is not None:
-            assert len(silu_quant_args) == 2
-            x_q, x_scale = silu_quant_args
-        else:
-            x_q, x_scale = per_token_quant_int8(x)
+        x_q, x_scale = per_token_quant_int8(x)

        if self.w8a8_strategy==1:
            m=x_q.shape[0]

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -196,11 +196,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                    os.environ['VLLM_USE_LIGHTOP'] = '1'
                if not envs.is_set("VLLM_USE_OPT_CAT"):
                    os.environ['VLLM_USE_OPT_CAT'] = '1'
-                if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
-                    if not envs.is_set("USE_FUSED_RMS_QUANT"):
-                        os.environ['USE_FUSED_RMS_QUANT'] = '1'
-                    if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
-                        os.environ['USE_FUSED_SILU_MUL_QUANT'] = '1'
+                # if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
+                #     if not envs.is_set("USE_FUSED_RMS_QUANT"):
+                #         os.environ['USE_FUSED_RMS_QUANT'] = '1'
+                #     if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
+                #         os.environ['USE_FUSED_SILU_MUL_QUANT'] = '1'
            else:
                if not envs.is_set("VLLM_USE_PD_SPLIT"):
                    os.environ['VLLM_USE_PD_SPLIT'] = '1'
@@ -228,11 +228,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module],
                    os.environ['VLLM_USE_LIGHTOP'] = '1'
                if not envs.is_set("VLLM_USE_OPT_CAT"):
                    os.environ['VLLM_USE_OPT_CAT'] = '1'
-                if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
-                    if not envs.is_set("USE_FUSED_RMS_QUANT"):
-                        os.environ['USE_FUSED_RMS_QUANT'] = '1'
-                    if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
-                        os.environ['USE_FUSED_SILU_MUL_QUANT'] = '1'
+                # if model_config.quantization in {"slimquant_w4a8", "slimquant_w4a8_marlin", "slimquant_compressed_tensors_marlin", "compressed-tensors"}:
+                #     if not envs.is_set("USE_FUSED_RMS_QUANT"):
+                #         os.environ['USE_FUSED_RMS_QUANT'] = '1'
+                #     if not envs.is_set("USE_FUSED_SILU_MUL_QUANT"):
+                #         os.environ['USE_FUSED_SILU_MUL_QUANT'] = '1'
            else:
                if not envs.is_set("VLLM_USE_PD_SPLIT"):
                    os.environ['VLLM_USE_PD_SPLIT'] = '1'

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -232,24 +232,11 @@ class DeepseekV2MLP(nn.Module):
            )
        self.act_fn = SiluAndMul()

-    def forward(self, x,
-                rms_weight: torch.Tensor | None = None,
-                residual: torch.Tensor | None = None,
-                update_hd: bool | None = False
-                ):
-        if envs.USE_FUSED_RMS_QUANT:
-            gate_up, new_resi, _  = self.gate_up_proj(x, rms_weight, residual, update_hd=update_hd)
-            if envs.USE_FUSED_SILU_MUL_QUANT:
-                x, _ = self.down_proj(gate_up, use_fused_silu_mul_quant=True)
-            else:
-                x = self.act_fn(gate_up)
-                x, _ = self.down_proj(x)
-            return x, new_resi
-        else:
-            gate_up, _ = self.gate_up_proj(x)
-            x = self.act_fn(gate_up)
-            x, _ = self.down_proj(x)
-            return x
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x


 class DeepseekV2MoE(nn.Module):