Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b2fa85ce
"ssh:/git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "a01f2faedfab1fa7f73ed0ca396370791c20daa3"
Commit
b2fa85ce
authored
Jul 07, 2025
by
zhuwenwen
Browse files
修改激活量化算子实现方式
parent
ef6c0877
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
21 additions
and
8 deletions
+21
-8
vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
...executor/layers/quantization/kernels/scaled_mm/cutlass.py
+11
-4
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+10
-4
No files found.
vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
View file @
b2fa85ce
...
@@ -14,6 +14,8 @@ from vllm.platforms import current_platform
...
@@ -14,6 +14,8 @@ from vllm.platforms import current_platform
from
.ScaledMMLinearKernel
import
(
ScaledMMLinearKernel
,
from
.ScaledMMLinearKernel
import
(
ScaledMMLinearKernel
,
ScaledMMLinearLayerConfig
)
ScaledMMLinearLayerConfig
)
from
lmslim.layers.gemm.int8_utils
import
per_token_quant_int8
class
CutlassScaledMMLinearKernel
(
ScaledMMLinearKernel
):
class
CutlassScaledMMLinearKernel
(
ScaledMMLinearKernel
):
...
@@ -112,10 +114,15 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
...
@@ -112,10 +114,15 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
# * dynamic, i_s is None and x_s computed from x.
# * dynamic, i_s is None and x_s computed from x.
# * static, i_s is scalar and x_s is i_s.
# * static, i_s is scalar and x_s is i_s.
symmetric
=
azp_adj
is
None
symmetric
=
azp_adj
is
None
x_q
,
x_s
,
x_zp
=
ops
.
scaled_int8_quant
(
x
.
contiguous
(),
if
i_s
is
None
and
i_zp
is
None
and
symmetric
is
True
:
i_s
,
x_q
,
x_s
=
per_token_quant_int8
(
x
)
i_zp
,
x_zp
=
None
symmetric
=
symmetric
)
else
:
x_q
,
x_s
,
x_zp
=
ops
.
scaled_int8_quant
(
x
.
contiguous
(),
i_s
,
i_zp
,
symmetric
=
symmetric
)
if
x_zp
is
not
None
:
if
x_zp
is
not
None
:
# Currently, static is always per-tensor and dynamic is per-token
# Currently, static is always per-tensor and dynamic is per-token
...
...
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
b2fa85ce
...
@@ -10,6 +10,7 @@ from vllm import envs
...
@@ -10,6 +10,7 @@ from vllm import envs
from
vllm.config
import
CompilationLevel
,
get_current_vllm_config
from
vllm.config
import
CompilationLevel
,
get_current_vllm_config
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
W8a8GetCacheJSON
from
vllm.utils
import
W8a8GetCacheJSON
from
lmslim.layers.gemm.int8_utils
import
per_token_quant_int8
# Input scaling factors are no longer optional in _scaled_mm starting
# Input scaling factors are no longer optional in _scaled_mm starting
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
...
@@ -396,10 +397,15 @@ def apply_int8_linear(
...
@@ -396,10 +397,15 @@ def apply_int8_linear(
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * static, layer.input_scale is scalar and x_scale is input_scale.
# * static, layer.input_scale is scalar and x_scale is input_scale.
symmetric
=
azp_adj
is
None
symmetric
=
azp_adj
is
None
x_q
,
x_scale
,
x_zp
=
ops
.
scaled_int8_quant
(
input
,
if
input_scale
is
None
and
input_zero_point
is
None
and
symmetric
is
True
:
input_scale
,
x_q
,
x_scale
=
per_token_quant_int8
(
input
)
input_zero_point
,
x_zp
=
None
symmetric
=
symmetric
)
else
:
x_q
,
x_scale
,
x_zp
=
ops
.
scaled_int8_quant
(
input
,
input_scale
,
input_zero_point
,
symmetric
=
symmetric
)
if
x_zp
is
not
None
:
if
x_zp
is
not
None
:
# Currently, static is always per-tensor and dynamic is per-token
# Currently, static is always per-tensor and dynamic is per-token
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment