Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9f087f8b
Commit
9f087f8b
authored
Sep 30, 2025
by
zhuwenwen
Browse files
DeepSeek-R1-Channel-INT8调用rmsquant融合
parent
1e911dbd
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
21 additions
and
13 deletions
+21
-13
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+3
-2
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
...ompressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+4
-2
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+14
-9
No files found.
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
9f087f8b
...
@@ -767,7 +767,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
...
@@ -767,7 +767,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
def
apply
(
self
,
def
apply
(
self
,
layer
:
torch
.
nn
.
Module
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
):
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
"""
"""
Use the output of create_weights and the CompressedTensorsScheme
Use the output of create_weights and the CompressedTensorsScheme
associated with the layer to apply the forward pass with the
associated with the layer to apply the forward pass with the
...
@@ -777,7 +778,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
...
@@ -777,7 +778,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
scheme
=
layer
.
scheme
scheme
=
layer
.
scheme
if
scheme
is
None
:
if
scheme
is
None
:
raise
ValueError
(
"A scheme must be defined for each layer"
)
raise
ValueError
(
"A scheme must be defined for each layer"
)
return
scheme
.
apply_weights
(
layer
,
x
,
bias
=
bias
)
return
scheme
.
apply_weights
(
layer
,
x
,
bias
=
bias
,
input_quant_args
=
input_quant_args
)
class
CompressedTensorsKVCacheMethod
(
BaseKVCacheMethod
):
class
CompressedTensorsKVCacheMethod
(
BaseKVCacheMethod
):
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
View file @
9f087f8b
...
@@ -111,7 +111,8 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
...
@@ -111,7 +111,8 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
self
.
kernel
.
process_weights_after_loading
(
layer
)
self
.
kernel
.
process_weights_after_loading
(
layer
)
def
apply_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
def
apply_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
])
->
torch
.
Tensor
:
bias
:
Optional
[
torch
.
Tensor
],
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
)
->
torch
.
Tensor
:
# return self.kernel.apply_weights(layer, x, bias)
# return self.kernel.apply_weights(layer, x, bias)
...
@@ -122,5 +123,6 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
...
@@ -122,5 +123,6 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
input_zero_point
=
layer
.
input_zero_point
,
input_zero_point
=
layer
.
input_zero_point
,
azp_adj
=
layer
.
azp_adj
,
azp_adj
=
layer
.
azp_adj
,
bias
=
bias
,
bias
=
bias
,
w8a8_strategy
=
self
.
w8a8_strategy
)
w8a8_strategy
=
self
.
w8a8_strategy
,
input_quant_args
=
input_quant_args
)
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
9f087f8b
...
@@ -447,20 +447,25 @@ def apply_int8_linear(
...
@@ -447,20 +447,25 @@ def apply_int8_linear(
azp_adj
:
Optional
[
torch
.
Tensor
]
=
None
,
azp_adj
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
w8a8_strategy
:
Optional
[
int
]
=
0
,
w8a8_strategy
:
Optional
[
int
]
=
0
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
):
# ops.scaled_int8_quant supports both dynamic and static quant.
# ops.scaled_int8_quant supports both dynamic and static quant.
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * static, layer.input_scale is scalar and x_scale is input_scale.
# * static, layer.input_scale is scalar and x_scale is input_scale.
symmetric
=
azp_adj
is
None
if
envs
.
USE_FUSED_RMS_QUANT
and
input_quant_args
is
not
None
:
if
input_scale
is
None
and
input_zero_point
is
None
and
symmetric
is
True
:
assert
len
(
input_quant_args
)
==
2
x_q
,
x_scale
=
per_token_quant_int8
(
input
)
x_zp
=
None
x_zp
=
None
x_q
,
x_scale
=
input_quant_args
else
:
else
:
# not USE_FUSED_RMS_QUANT
x_q
,
x_scale
,
x_zp
=
ops
.
scaled_int8_quant
(
input
,
symmetric
=
azp_adj
is
None
input_scale
,
if
input_scale
is
None
and
input_zero_point
is
None
and
symmetric
is
True
:
input_zero_point
,
x_q
,
x_scale
=
per_token_quant_int8
(
input
)
symmetric
=
symmetric
)
x_zp
=
None
else
:
x_q
,
x_scale
,
x_zp
=
ops
.
scaled_int8_quant
(
input
,
input_scale
,
input_zero_point
,
symmetric
=
symmetric
)
if
x_zp
is
not
None
:
if
x_zp
is
not
None
:
# Currently, static is always per-tensor and dynamic is per-token
# Currently, static is always per-tensor and dynamic is per-token
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment