"vscode:/vscode.git/clone" did not exist on "383f2ce8270644d7d54367bfce8d19ff6a487e2c"
Commit b2fa85ce authored by zhuwenwen's avatar zhuwenwen
Browse files

修改激活量化算子实现方式

parent ef6c0877
......@@ -14,6 +14,8 @@ from vllm.platforms import current_platform
from .ScaledMMLinearKernel import (ScaledMMLinearKernel,
ScaledMMLinearLayerConfig)
from lmslim.layers.gemm.int8_utils import per_token_quant_int8
class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
......@@ -112,10 +114,15 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
# * dynamic, i_s is None and x_s computed from x.
# * static, i_s is scalar and x_s is i_s.
symmetric = azp_adj is None
x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(),
i_s,
i_zp,
symmetric=symmetric)
if i_s is None and i_zp is None and symmetric is True:
x_q, x_s=per_token_quant_int8(x)
x_zp =None
else:
x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(),
i_s,
i_zp,
symmetric=symmetric)
if x_zp is not None:
# Currently, static is always per-tensor and dynamic is per-token
......
......@@ -10,6 +10,7 @@ from vllm import envs
from vllm.config import CompilationLevel, get_current_vllm_config
from vllm.platforms import current_platform
from vllm.utils import W8a8GetCacheJSON
from lmslim.layers.gemm.int8_utils import per_token_quant_int8
# Input scaling factors are no longer optional in _scaled_mm starting
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
......@@ -396,10 +397,15 @@ def apply_int8_linear(
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * static, layer.input_scale is scalar and x_scale is input_scale.
symmetric = azp_adj is None
x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
input_scale,
input_zero_point,
symmetric=symmetric)
if input_scale is None and input_zero_point is None and symmetric is True:
x_q, x_scale=per_token_quant_int8(input)
x_zp =None
else:
x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
input_scale,
input_zero_point,
symmetric=symmetric)
if x_zp is not None:
# Currently, static is always per-tensor and dynamic is per-token
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment