修改激活量化算子实现方式 (Change how the activation quantization operator is implemented)

b2fa85ce · zhuwenwen · ef6c0877 · b2fa85ce · b2fa85ce
Commit b2fa85ce authored Jul 07, 2025 by zhuwenwen
2 changed files
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -14,6 +14,8 @@ from vllm.platforms import current_platform
 from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
                                   ScaledMMLinearLayerConfig)

+from lmslim.layers.gemm.int8_utils import per_token_quant_int8
+

 class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):

@@ -112,10 +114,15 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
        # * dynamic, i_s is None and x_s computed from x.
        # * static, i_s is scalar and x_s is i_s.
        symmetric = azp_adj is None
-        x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(),
-                                               i_s,
-                                               i_zp,
-                                               symmetric=symmetric)
+        if i_s is None and i_zp is None and symmetric is True:
+            x_q, x_s=per_token_quant_int8(x)
+            x_zp =None
+
+        else:
+            x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(),
+                                                i_s,
+                                                i_zp,
+                                                symmetric=symmetric)

        if x_zp is not None:
            # Currently, static is always per-tensor and dynamic is per-token

--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -10,6 +10,7 @@ from vllm import envs
 from vllm.config import CompilationLevel, get_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.utils import W8a8GetCacheJSON
+from lmslim.layers.gemm.int8_utils import per_token_quant_int8

 # Input scaling factors are no longer optional in _scaled_mm starting
 # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
@@ -396,10 +397,15 @@ def apply_int8_linear(
    # * dynamic, layer.input_scale is None and x_scale computed from x.
    # * static, layer.input_scale is scalar and x_scale is input_scale.
    symmetric = azp_adj is None
-    x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
-                                               input_scale,
-                                               input_zero_point,
-                                               symmetric=symmetric)
+    if input_scale is None and input_zero_point is None and symmetric is True:
+        x_q, x_scale=per_token_quant_int8(input)
+        x_zp =None
+
+    else:
+        x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
+                                                   input_scale,
+                                                   input_zero_point,
+                                                   symmetric=symmetric)

    if x_zp is not None:
        # Currently, static is always per-tensor and dynamic is per-token