remove USE_FUSED_RMS_QUANT

8cfec41a · zhuwenwen · 60b37c6b · 8cfec41a · 8cfec41a · 8cfec41a
Commit 8cfec41a authored Jan 07, 2026 by zhuwenwen
4 changed files
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1984,8 +1984,6 @@ class FusedMoE(CustomOp):
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
-        i_q: torch.Tensor | None = None,
-        i_s: torch.Tensor | None= None, **_
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        assert self.quant_method is not None

@@ -2053,19 +2051,6 @@ class FusedMoE(CustomOp):
                )

            # Matrix multiply.
-            if envs.USE_FUSED_RMS_QUANT:
-                final_hidden_states = self.quant_method.apply(
-                    layer=self,
-                    x=hidden_states_combined
-                    if do_naive_dispatch_combine
-                    else hidden_states,
-                    router_logits=router_logits,
-                    use_nn_moe=self.use_nn_moe,
-                    use_fused_gate=self.use_fused_gate,
-                    i_q=i_q,
-                    i_s=i_s,
-                )
-            else:
            final_hidden_states = self.quant_method.apply(
                layer=self,
                x=hidden_states_combined
@@ -2186,15 +2171,10 @@ def moe_forward(
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    layer_name: str,
-    i_q: torch.Tensor | None = None,
-    i_s: torch.Tensor | None = None, 
 ) -> torch.Tensor:
    forward_context: ForwardContext = get_forward_context()
    self = forward_context.no_compile_layers[layer_name]
    assert self.shared_experts is None
-    if envs.USE_FUSED_RMS_QUANT:
-        return self.forward_impl(hidden_states, router_logits, i_q, i_s)
-    else:
    return self.forward_impl(hidden_states, router_logits)



--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -41,7 +41,6 @@ import os
 from vllm.model_executor.utils import gemm_bank_conf

 from lmslim.quantize.quant_ops import lm_faster_rmsquant
-from lmslim.quantize.quant_ops import lm_fuse_silu_mul_quant

 logger = init_logger(__name__)

@@ -450,49 +449,14 @@ class ReplicatedLinear(LinearBase):

    def forward(
        self,
-        input_: torch.Tensor,
-        rms_weight: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-        quant_args: list | None = None,
-        update_hd: bool | None= True,
+        x: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
-        if envs.USE_FUSED_RMS_QUANT and (rms_weight is not None or quant_args is not None):
-            if quant_args is not None:
-                input_quant_args = quant_args
-            
        bias = self.bias if not self.skip_bias_add else None
        assert self.quant_method is not None
-                output = self.quant_method.apply(self, input_, bias, input_quant_args)
-                output_bias = self.bias if self.skip_bias_add else None
-                if not self.return_bias:
-                    return output
-                return output, output_bias

-            else:
-                i_q, _scales = lm_faster_rmsquant(input=input_,
-                                                  rms_weight=rms_weight,
-                                                  epsilon=self.eps,
-                                                  quant_dtype=torch.int8,
-                                                  residual=residual,
-                                                  update_input=update_hd
-                                                  )
-            
-                new_residual = residual
-                input_quant_args = [i_q, _scales]
-                
-                bias = self.bias if not self.skip_bias_add else None
-                assert self.quant_method is not None
-                output = self.quant_method.apply(self, input_, bias, input_quant_args)
+        output = self.quant_method.apply(self, x, bias)
        output_bias = self.bias if self.skip_bias_add else None
-                if not self.return_bias:
-                    return output
-                return output, new_residual, output_bias, input_quant_args

-        else:
-            bias = self.bias if not self.skip_bias_add else None
-            assert self.quant_method is not None
-            output = self.quant_method.apply(self, input_, bias)
-            output_bias = self.bias if self.skip_bias_add else None
        if not self.return_bias:
            return output
        return output, output_bias
@@ -660,39 +624,13 @@ class ColumnParallelLinear(LinearBase):
    def forward(
        self,
        input_,
-        rms_weight: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-        update_hd: bool | None = True
    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
-        if envs.USE_FUSED_RMS_QUANT and rms_weight is not None:
-            input_quant_args = None
-            assert rms_weight is not None 
-            i_q, _scales = lm_faster_rmsquant(input=input_,
-                                        rms_weight=rms_weight,
-                                        epsilon=self.eps,
-                                        quant_dtype=torch.int8,
-                                        residual=residual,
-                                        update_input=update_hd)
-            new_residual = residual
-            input_quant_args = [i_q, _scales]
-        
        bias = self.bias if not self.skip_bias_add else None

-            assert self.quant_method is not None
-            output_parallel = self.quant_method.apply(self, input_, bias, input_quant_args)
-            if self.gather_output and self.tp_size > 1:
-                output = tensor_model_parallel_all_gather(output_parallel)
-            else:
-                output = output_parallel
-            output_bias = self.bias if self.skip_bias_add else None
-            if not self.return_bias:
-                return output
-            return output, new_residual, output_bias
-        else:
-            bias = self.bias if not self.skip_bias_add else None
        # Matrix multiply.
        assert self.quant_method is not None
        output_parallel = self.quant_method.apply(self, input_, bias)
+
        if self.gather_output and self.tp_size > 1:
            # All-gather across the partitions.
            output = tensor_model_parallel_all_gather(output_parallel)
@@ -738,54 +676,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                    will be treated as a "Replicated" MergedLinear.
    """

-    def forward(
-        self, input_,
-        rms_weight: torch.Tensor | None = None,
-        residual: torch.Tensor | None = None,
-        update_hd:  bool | None = True
-    ) -> torch.Tensor | tuple[torch.Tensor, Parameter] | None:
-        if envs.USE_FUSED_RMS_QUANT and rms_weight is not None:
-            input_quant_args = None
-            assert residual is not None and rms_weight is not None 
-            i_q, _scales = lm_faster_rmsquant(input=input_,
-                                        rms_weight=rms_weight,
-                                        epsilon=self.eps,
-                                        quant_dtype=torch.int8,
-                                        residual=residual,
-                                        update_input=update_hd)
-            
-            new_residual = residual
-            input_quant_args = [i_q, _scales]
-            
-            
-            bias = self.bias if not self.skip_bias_add else None
-            assert self.quant_method is not None
-            output_parallel = self.quant_method.apply(self, input_, bias, input_quant_args)
-            
-            if self.gather_output:
-                # All-gather across the partitions.
-                output = tensor_model_parallel_all_gather(output_parallel)
-            else:
-                output = output_parallel
-            output_bias = self.bias if self.skip_bias_add else None
-            if not self.return_bias:
-                return output
-            return output, new_residual, output_bias
-        else: 
-            bias = self.bias if not self.skip_bias_add else None
-
-            assert self.quant_method is not None
-            output_parallel = self.quant_method.apply(self, input_, bias)
-            if self.gather_output and self.tp_size > 1:
-                # All-gather across the partitions.
-                output = tensor_model_parallel_all_gather(output_parallel)
-            else:
-                output = output_parallel
-            output_bias = self.bias if self.skip_bias_add else None
-            if not self.return_bias:
-                return output
-            return output, output_bias
-
    def __init__(
        self,
        input_size: int,

--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -532,15 +532,6 @@ def apply_int8_linear(
    # ops.scaled_int8_quant supports both dynamic and static quant.
    # * dynamic, layer.input_scale is None and x_scale computed from x.
    # * static, layer.input_scale is scalar and x_scale is input_scale.
-    if envs.USE_FUSED_RMS_QUANT and input_quant_args is not None:
-        assert len(input_quant_args) == 2
-        x_zp =None
-        x_q, x_scale = input_quant_args
-    elif envs.USE_FUSED_RMS_QUANT and silu_quant_args is not None:
-        assert len(silu_quant_args) == 2
-        x_zp =None
-        x_q, x_scale = silu_quant_args
-    else: # not USE_FUSED_RMS_QUANT
    symmetric = azp_adj is None
    if input_scale is None and input_zero_point is None and symmetric is True:
        x_q, x_scale=per_token_quant_int8(input)

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1219,49 +1219,6 @@ class DeepseekV2DecoderLayer(nn.Module):
        residual: torch.Tensor | None,
        llama_4_scaling: torch.Tensor | None = None,
    ) -> torch.Tensor:
-        if envs.USE_FUSED_RMS_QUANT:
-            # Fix residual FP16 overflow
-            residual_fix_overflow = False
-            
-            assert self.input_layernorm.has_weight is True
-            if residual is None:
-                residual = hidden_states
-                hidden_states, _ = self.self_attn(
-                    positions = positions,
-                    hidden_states = hidden_states,
-                    rms_weight = self.input_layernorm.weight.data,
-                    residual = None
-                )
-                residual_fix_overflow = True
-            else:
-                hidden_states, new_residual = self.self_attn(
-                    positions = positions,
-                    hidden_states = hidden_states,
-                    rms_weight = self.input_layernorm.weight.data,
-                    residual = residual
-                )
-                residual = new_residual
-                
-            if hidden_states.dtype == torch.float16:
-                # rmsnorm, and rmsnorm result would not affect by scale.
-                hidden_states *= 1. / self.routed_scaling_factor
-                if self.layer_idx == 0 or residual_fix_overflow:
-                    # The residual is shared by all layers, we only scale it on
-                    # first layer.
-                    residual *= 1. / self.routed_scaling_factor
-
-            hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)
-
-            if isinstance(self.mlp,
-                        DeepseekV2MLP) and hidden_states.dtype == torch.float16:
-                # Fix FP16 overflow
-                # Scaling the DeepseekV2MLP output, it is the input of
-                # input_layernorm of next decoder layer.
-                # The scaling of DeepseekV2MOE output would be done in the forward
-                # of DeepseekV2MOE
-                hidden_states *= 1. / self.routed_scaling_factor
-            return hidden_states, new_resi
-        else:
        # Self Attention
        # Fix residual FP16 overflow
        residual_fix_overflow = False