Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a5043e83
Commit
a5043e83
authored
Sep 25, 2025
by
wujl5
Browse files
DeepSeek-R1-Channel-INT8调用rmsquant融合。
parent
5eec6110
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
22 additions
and
14 deletions
+22
-14
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+3
-2
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
...ompressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+4
-3
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+15
-9
No files found.
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
a5043e83
...
...
@@ -666,7 +666,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
def
apply
(
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
):
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
"""
Use the output of create_weights and the CompressedTensorsScheme
associated with the layer to apply the forward pass with the
...
...
@@ -677,7 +678,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
scheme
=
layer
.
scheme
if
scheme
is
None
:
raise
ValueError
(
"A scheme must be defined for each layer"
)
return
scheme
.
apply_weights
(
layer
,
x
,
bias
=
bias
)
return
scheme
.
apply_weights
(
layer
,
x
,
bias
=
bias
,
input_quant_args
=
input_quant_args
)
class
CompressedTensorsKVCacheMethod
(
BaseKVCacheMethod
):
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
View file @
a5043e83
...
...
@@ -111,7 +111,8 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
self
.
kernel
.
process_weights_after_loading
(
layer
)
def
apply_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
Optional
[
torch
.
Tensor
])
->
torch
.
Tensor
:
bias
:
Optional
[
torch
.
Tensor
],
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
)
->
torch
.
Tensor
:
# return self.kernel.apply_weights(layer, x, bias)
...
...
@@ -122,5 +123,5 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
input_zero_point
=
layer
.
input_zero_point
,
azp_adj
=
layer
.
azp_adj
,
bias
=
bias
,
w8a8_strategy
=
self
.
w8a8_strategy
)
w8a8_strategy
=
self
.
w8a8_strategy
,
input_quant_args
=
input_quant_args
)
\ No newline at end of file
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
a5043e83
...
...
@@ -392,20 +392,26 @@ def apply_int8_linear(
azp_adj
:
Optional
[
torch
.
Tensor
]
=
None
,
bias
:
Optional
[
torch
.
Tensor
]
=
None
,
w8a8_strategy
:
Optional
[
int
]
=
0
,
input_quant_args
:
Optional
[
list
[
torch
.
Tensor
]]
=
None
):
# ops.scaled_int8_quant supports both dynamic and static quant.
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * static, layer.input_scale is scalar and x_scale is input_scale.
symmetric
=
azp_adj
is
None
if
input_scale
is
None
and
input_zero_point
is
None
and
symmetric
is
True
:
x_q
,
x_scale
=
per_token_quant_int8
(
input
)
x_zp
=
None
else
:
x_q
,
x_scale
,
x_zp
=
ops
.
scaled_int8_quant
(
input
,
input_scale
,
input_zero_point
,
symmetric
=
symmetric
)
if
envs
.
USE_FUSED_RMS_QUANT
and
input_quant_args
is
not
None
:
assert
len
(
input_quant_args
)
==
2
x_zp
=
None
x_q
,
x_scale
=
input_quant_args
else
:
# not USE_FUSED_RMS_QUANT
symmetric
=
azp_adj
is
None
if
input_scale
is
None
and
input_zero_point
is
None
and
symmetric
is
True
:
x_q
,
x_scale
=
per_token_quant_int8
(
input
)
x_zp
=
None
else
:
x_q
,
x_scale
,
x_zp
=
ops
.
scaled_int8_quant
(
input
,
input_scale
,
input_zero_point
,
symmetric
=
symmetric
)
if
x_zp
is
not
None
:
# Currently, static is always per-tensor and dynamic is per-token
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment