remove USE_FUSED_RMS_QUANT and USE_FUSED_SILU_MUL_QUANT

c2ef7fdd · zhuwenwen · 383f2ce8 · c2ef7fdd · c2ef7fdd · c2ef7fdd
Commit c2ef7fdd authored Jan 07, 2026 by zhuwenwen
5 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -233,8 +233,6 @@ if TYPE_CHECKING:
    VLLM_USE_LIGHTOP_MOE_SUM: bool = False
    VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
-    USE_FUSED_RMS_QUANT: bool = False
-    USE_FUSED_SILU_MUL_QUANT: bool = False
    VLLM_USE_PD_SPLIT: bool = False
    VLLM_USE_PP_SYNC: bool = False
    VLLM_USE_PIECEWISE: bool = False
@@ -1639,14 +1637,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_MERGE_ATTN_STATES_OPT":
        lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in
                 ("true", "1")),  
-    # vllm will use rmsquant fused op 
-    "USE_FUSED_RMS_QUANT": 
-    lambda: (os.getenv('USE_FUSED_RMS_QUANT', '0').lower() in
-             ("true", "1")),
-    # vllm will use silu_mul_quant fused op 
-    "USE_FUSED_SILU_MUL_QUANT": 
-    lambda: (os.getenv('USE_FUSED_SILU_MUL_QUANT', '0').lower() in
-             ("true", "1")),
    # vLLM will split prefill and decode, not mix up
    "VLLM_USE_PD_SPLIT":
        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "False").lower() in

--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -35,18 +35,6 @@ from vllm.utils import GiB_bytes
 import os
 from vllm.model_executor.utils import gemm_bank_conf
-if envs.USE_FUSED_RMS_QUANT:
-    try:
-        from lmslim.quantize.quant_ops import lm_faster_rmsquant
-    except Exception as e:
-        print(f"Error: Import fused rmsquant error: {e}") 
-if envs.USE_FUSED_SILU_MUL_QUANT:        
-    try:
-        # from lightop import fuse_silu_mul_quant
-        from lmslim.quantize.quant_ops import lm_fuse_silu_mul_quant
-    except Exception as e:
-        print(f"Error: Import fused silu_mul_qunat error: {e}")
 logger = init_logger(__name__)
@@ -441,51 +429,15 @@ class ReplicatedLinear(LinearBase):
        param.data.copy_(loaded_weight)
    def forward(
        self,
-        input_: torch.Tensor,
+        x: torch.Tensor,
-        rms_weight: Optional[torch.Tensor] = None,
-        residual: Optional[torch.Tensor] = None,
-        quant_args: Optional[list] = None,
-        update_hd: Optional[bool] = True
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
-        if envs.USE_FUSED_RMS_QUANT and (rms_weight is not None or quant_args is not None):
-            if quant_args is not None:
-                input_quant_args = quant_args
        bias = self.bias if not self.skip_bias_add else None
        assert self.quant_method is not None
-                output = self.quant_method.apply(self, input_, bias, input_quant_args)
-                output_bias = self.bias if self.skip_bias_add else None
-                if not self.return_bias:
-                    return output
-                return output, output_bias
-            else:
-                i_q, _scales = lm_faster_rmsquant(input=input_,
-                                                  rms_weight=rms_weight,
-                                                  epsilon=self.eps,
-                                                  quant_dtype=torch.int8,
-                                                  residual=residual,
-                                                  update_input=update_hd
-                                                  )
-                new_residual = residual
+        output = self.quant_method.apply(self, x, bias)
-                input_quant_args = [i_q, _scales]
-                bias = self.bias if not self.skip_bias_add else None
-                assert self.quant_method is not None
-                output = self.quant_method.apply(self, input_, bias, input_quant_args)
        output_bias = self.bias if self.skip_bias_add else None
-                if not self.return_bias:
-                    return output
-                return output, new_residual, output_bias, input_quant_args
-        else:
-            bias = self.bias if not self.skip_bias_add else None
-            assert self.quant_method is not None
-            output = self.quant_method.apply(self, input_, bias)
-            output_bias = self.bias if self.skip_bias_add else None
        if not self.return_bias:
            return output
        return output, output_bias
@@ -652,39 +604,13 @@ class ColumnParallelLinear(LinearBase):
    def forward(
        self,
        input_,
-        rms_weight: Optional[torch.Tensor] = None,
-        residual: Optional[torch.Tensor] = None,
-        update_hd: Optional[bool] = True
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
-        if envs.USE_FUSED_RMS_QUANT and rms_weight is not None:
-            input_quant_args = None
-            assert rms_weight is not None 
-            i_q, _scales = lm_faster_rmsquant(input=input_,
-                                        rms_weight=rms_weight,
-                                        epsilon=self.eps,
-                                        quant_dtype=torch.int8,
-                                        residual=residual,
-                                        update_input=update_hd)
-            new_residual = residual
-            input_quant_args = [i_q, _scales]
        bias = self.bias if not self.skip_bias_add else None
-            assert self.quant_method is not None
-            output_parallel = self.quant_method.apply(self, input_, bias, input_quant_args)
-            if self.gather_output and self.tp_size > 1:
-                output = tensor_model_parallel_all_gather(output_parallel)
-            else:
-                output = output_parallel
-            output_bias = self.bias if self.skip_bias_add else None
-            if not self.return_bias:
-                return output
-            return output, new_residual, output_bias
-        else:
-            bias = self.bias if not self.skip_bias_add else None
        # Matrix multiply.
        assert self.quant_method is not None
        output_parallel = self.quant_method.apply(self, input_, bias)
        if self.gather_output and self.tp_size > 1:
            # All-gather across the partitions.
            output = tensor_model_parallel_all_gather(output_parallel)
@@ -730,54 +656,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                    will be treated as a "Replicated" MergedLinear.
    """
-    def forward(
-        self, input_,
-        rms_weight: Optional[torch.Tensor] = None,
-        residual: Optional[torch.Tensor] = None,
-        update_hd: Optional[bool] = True
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
-        if envs.USE_FUSED_RMS_QUANT and rms_weight is not None:
-            input_quant_args = None
-            assert residual is not None and rms_weight is not None 
-            i_q, _scales = lm_faster_rmsquant(input=input_,
-                                        rms_weight=rms_weight,
-                                        epsilon=self.eps,
-                                        quant_dtype=torch.int8,
-                                        residual=residual,
-                                        update_input=update_hd)
-            new_residual = residual
-            input_quant_args = [i_q, _scales]
-            bias = self.bias if not self.skip_bias_add else None
-            assert self.quant_method is not None
-            output_parallel = self.quant_method.apply(self, input_, bias, input_quant_args)
-            if self.gather_output:
-                # All-gather across the partitions.
-                output = tensor_model_parallel_all_gather(output_parallel)
-            else:
-                output = output_parallel
-            output_bias = self.bias if self.skip_bias_add else None
-            if not self.return_bias:
-                return output
-            return output, new_residual, output_bias
-        else: # not USE_FUSED_RMS_QUANT
-            bias = self.bias if not self.skip_bias_add else None
-            assert self.quant_method is not None
-            output_parallel = self.quant_method.apply(self, input_, bias)
-            if self.gather_output and self.tp_size > 1:
-                # All-gather across the partitions.
-                output = tensor_model_parallel_all_gather(output_parallel)
-            else:
-                output = output_parallel
-            output_bias = self.bias if self.skip_bias_add else None
-            if not self.return_bias:
-                return output
-            return output, output_bias
    def __init__(
        self,
        input_size: int,
@@ -1548,7 +1426,6 @@ class RowParallelLinear(LinearBase):
    def forward(
        self,
        input_,
-        use_fused_silu_mul_quant: Optional[bool] = False
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
        if self.input_is_parallel:
            input_parallel = input_
@@ -1562,15 +1439,6 @@ class RowParallelLinear(LinearBase):
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
-        if use_fused_silu_mul_quant:
-            xq, xs = lm_fuse_silu_mul_quant(input_parallel)
-            silu_quant_args = [xq, xs]
-            output_parallel = self.quant_method.apply(self,
-                                                      input_parallel,
-                                                      bias_,
-                                                      silu_quant_args=silu_quant_args)
-        else:
        output_parallel = self.quant_method.apply(self, input_parallel, bias_)
        if self.reduce_results and self.tp_size > 1:

--- a/vllm/model_executor/layers/quantization/slimquant_w4a8.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8.py
@@ -155,15 +155,7 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
-        input_quant_args: Optional[list[torch.Tensor]] = None,
-        silu_quant_args: Optional[list[torch.Tensor]] = None 
    ):
-        if envs.USE_FUSED_RMS_QUANT and input_quant_args is not None:
-            assert len(input_quant_args) == 2
-            x_q, x_scale = input_quant_args
-        elif envs.USE_FUSED_SILU_MUL_QUANT and silu_quant_args is not None:
-            x_q, x_scale = silu_quant_args
-        else:
        x_q, x_scale = per_token_quant_int8(x)
        if self.w8a8_strategy==1:

--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -468,11 +468,7 @@ def apply_int8_linear(
    # ops.scaled_int8_quant supports both dynamic and static quant.
    # * dynamic, layer.input_scale is None and x_scale computed from x.
    # * static, layer.input_scale is scalar and x_scale is input_scale.
-    if envs.USE_FUSED_RMS_QUANT and input_quant_args is not None:
-        assert len(input_quant_args) == 2
-        x_zp =None
-        x_q, x_scale = input_quant_args
-    else: # not USE_FUSED_RMS_QUANT
    symmetric = azp_adj is None
    if input_scale is None and input_zero_point is None and symmetric is True:
        x_q, x_scale=per_token_quant_int8(input)

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -133,20 +133,7 @@ class DeepseekV2MLP(nn.Module):
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()
-    def forward(self, x,
+    def forward(self, x):
-                rms_weight: Optional[torch.Tensor] = None,
-                residual: Optional[torch.Tensor] = None,
-                update_hd: Optional[bool] = False
-                ):
-        if envs.USE_FUSED_RMS_QUANT:
-            gate_up, new_resi, _  = self.gate_up_proj(x, rms_weight, residual, update_hd=update_hd)
-            if envs.USE_FUSED_SILU_MUL_QUANT:
-                x, _ = self.down_proj(gate_up, use_fused_silu_mul_quant=True)
-            else:
-                x = self.act_fn(gate_up)
-                x, _ = self.down_proj(x)
-            return x, new_resi
-        else:
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
@@ -1128,49 +1115,6 @@ class DeepseekV2DecoderLayer(nn.Module):
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
    ) -> torch.Tensor:
-        if envs.USE_FUSED_RMS_QUANT:
-            # Fix residual FP16 overflow
-            residual_fix_overflow = False
-            assert self.input_layernorm.has_weight is True
-            if residual is None:
-                residual = hidden_states
-                hidden_states, _ = self.self_attn(
-                    positions = positions,
-                    hidden_states = hidden_states,
-                    rms_weight = self.input_layernorm.weight.data,
-                    residual = None
-                )
-                residual_fix_overflow = True
-            else:
-                hidden_states, new_residual = self.self_attn(
-                    positions = positions,
-                    hidden_states = hidden_states,
-                    rms_weight = self.input_layernorm.weight.data,
-                    residual = residual
-                )
-                residual = new_residual
-            if hidden_states.dtype == torch.float16:
-                # rmsnorm, and rmsnorm result would not affect by scale.
-                hidden_states *= 1. / self.routed_scaling_factor
-                if self.layer_idx == 0 or residual_fix_overflow:
-                    # The residual is shared by all layers, we only scale it on
-                    # first layer.
-                    residual *= 1. / self.routed_scaling_factor
-            hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)
-            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
-                # Fix FP16 overflow
-                # Scaling the DeepseekV2MLP output, it is the input of
-                # input_layernorm of next decoder layer.
-                # The scaling of DeepseekV2MOE output would be done in the forward
-                # of DeepseekV2MOE
-                hidden_states *= 1. / self.routed_scaling_factor
-            return hidden_states, new_resi
-        else:
        # Self Attention
        # Fix residual FP16 overflow
        residual_fix_overflow = False