deepseek_v2_w8a8 增加 silu_mul_quant融合

216e414b · wujl5 · zhuwenwen · 2b47bce9 · 216e414b · 216e414b
Commit 216e414b authored Nov 21, 2025 by wujl5 Committed by zhuwenwen Nov 21, 2025
5 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -172,7 +172,7 @@ if TYPE_CHECKING:
    VLLM_USE_LIGHTOP_MOE_SUM: bool = False
    VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
-    USE_FUSED_RMS_QUANT: bool = False
+    USE_FUSED_RMS_QUANT: bool = True
    USE_FUSED_SILU_MUL_QUANT: bool = True
    VLLM_P2P_ASYNC: bool = False
    VLLM_P2P_BUF_TOKENS: int = 30000
@@ -1142,8 +1142,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
                 ("true", "1")),  
    # vLLM will use the fused RMS-quant op.
    "USE_FUSED_RMS_QUANT": 
-    lambda: (os.getenv('USE_FUSED_RMS_QUANT', '0').lower() in
-             ("true", "1")),
+    lambda: bool(int(os.getenv("USE_FUSED_RMS_QUANT", "1"))),
    # vLLM will use the fused silu_mul_quant op.
    # This variable defaults to true,
    # but it is still controlled by CRQ and RQ.

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -669,7 +669,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None, 
-              input_quant_args: Optional[list[torch.Tensor]] = None):
+              input_quant_args: Optional[list[torch.Tensor]] = None,
+              silu_quant_args: Optional[list[torch.Tensor]] = None):
        """
        Use the output of create_weights and the CompressedTensorsScheme
        associated with the layer to apply the forward pass with the
@@ -680,7 +681,10 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
        scheme = layer.scheme
        if scheme is None:
            raise ValueError("A scheme must be defined for each layer")
-        return scheme.apply_weights(layer, x, bias=bias, input_quant_args=input_quant_args)
+        return scheme.apply_weights(layer, x, 
+                                    bias=bias, 
+                                    input_quant_args=input_quant_args, 
+                                    silu_quant_args=silu_quant_args)


 class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):

--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -112,7 +112,9 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):

    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
                      bias: Optional[torch.Tensor], 
-                      input_quant_args: Optional[list[torch.Tensor]] = None) -> torch.Tensor:
+                      input_quant_args: Optional[list[torch.Tensor]] = None,
+                      silu_quant_args: Optional[list[torch.Tensor]] = None
+                      ) -> torch.Tensor:
        
        # return self.kernel.apply_weights(layer, x, bias)

@@ -124,4 +126,5 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
                                 azp_adj=layer.azp_adj,
                                 bias=bias,
                                 w8a8_strategy=self.w8a8_strategy,
-                                 input_quant_args=input_quant_args)
\ No newline at end of file
+                                 input_quant_args=input_quant_args,
+                                 silu_quant_args=silu_quant_args)
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/slimquant_w4a8.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8.py
@@ -168,6 +168,7 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):
            assert len(input_quant_args) == 2
            x_q, x_scale = input_quant_args
        elif envs.USE_FUSED_CUSTOM_ALL_REDUCE_RMS_QUANT and silu_quant_args is not None:
+            assert len(silu_quant_args) == 2
            x_q, x_scale = silu_quant_args
        else:
            x_q, x_scale = per_token_quant_int8(x)

--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -406,7 +406,8 @@ def apply_int8_linear(
    azp_adj: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
    w8a8_strategy:Optional[int]=0,
-    input_quant_args: Optional[list[torch.Tensor]] = None
+    input_quant_args: Optional[list[torch.Tensor]] = None,
+    silu_quant_args: Optional[list[torch.Tensor]] = None
 ):
    # ops.scaled_int8_quant supports both dynamic and static quant.
    # * dynamic, layer.input_scale is None and x_scale computed from x.
@@ -416,7 +417,11 @@ def apply_int8_linear(
        assert len(input_quant_args) == 2
        x_zp =None
        x_q, x_scale = input_quant_args
-    else: # not USE_FUSED_RMS_QUANT
+    elif envs.USE_FUSED_RMS_QUANT and silu_quant_args is not None:
+        assert len(silu_quant_args) == 2
+        x_zp =None
+        x_q, x_scale = silu_quant_args
+    else: # default
        symmetric = azp_adj is None
        if input_scale is None and input_zero_point is None and symmetric is True:
            x_q, x_scale=per_token_quant_int8(input)