Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8b1e4ef0
Commit
8b1e4ef0
authored
Aug 07, 2025
by
gaoqiong
Browse files
修改增加lmslimquant_w4a8量化支持
parent
cc6f327a
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
15 deletions
+22
-15
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/__init__.py
+3
-3
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+8
-1
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+10
-10
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+1
-1
No files found.
vllm/model_executor/layers/quantization/__init__.py
View file @
8b1e4ef0
...
...
@@ -37,7 +37,7 @@ QuantizationMethods = Literal[
"auto-round"
,
"rtn"
,
"blockwise_int8"
,
"
w8a8_int
8"
"
slimquant_w4a
8"
]
QUANTIZATION_METHODS
:
list
[
str
]
=
list
(
get_args
(
QuantizationMethods
))
...
...
@@ -117,7 +117,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
from
.torchao
import
TorchAOConfig
from
.tpu_int8
import
Int8TpuConfig
from
.blockwise_int8
import
BlockInt8Config
from
.
w8a8_int8
import
W8
A8Int8Config
from
.
slimquant_w4a8
import
SlimQuantW4
A8Int8Config
method_to_config
:
dict
[
str
,
type
[
QuantizationConfig
]]
=
{
"aqlm"
:
AQLMConfig
,
...
...
@@ -150,7 +150,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
"auto-round"
:
AutoRoundConfig
,
"rtn"
:
RTNConfig
,
"blockwise_int8"
:
BlockInt8Config
,
"
w8a8_int8"
:
W8
A8Int8Config
,
"
slimquant_w4a8"
:
SlimQuantW4
A8Int8Config
,
}
# Update the `method_to_config` with customized quantization methods.
method_to_config
.
update
(
_CUSTOMIZED_METHOD_TO_QUANT_CONFIG
)
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
8b1e4ef0
...
...
@@ -1000,6 +1000,8 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
raise
ValueError
(
"For INT8 Fused MoE layers, we require channelwise, "
"dynamic per token quantization. Found static input scales."
)
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
num_experts
:
int
,
hidden_size
:
int
,
intermediate_size_per_partition
:
int
,
...
...
@@ -1089,6 +1091,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
apply_router_weight_on_input
:
bool
=
False
,
activation
:
str
=
"silu"
,
enable_eplb
:
bool
=
False
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
expert_load_view
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_to_physical_map
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_replica_count
:
Optional
[
torch
.
Tensor
]
=
None
,
...
...
@@ -1111,6 +1116,8 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
num_expert_group
=
num_expert_group
,
custom_routing_function
=
custom_routing_function
,
scoring_func
=
scoring_func
,
routed_scaling_factor
=
routed_scaling_factor
,
use_fused_gate
=
use_fused_gate
,
e_score_correction_bias
=
e_score_correction_bias
)
return
fused_experts
(
...
...
vllm/model_executor/layers/quantization/
w8a8_int
8.py
→
vllm/model_executor/layers/quantization/
slimquant_w4a
8.py
View file @
8b1e4ef0
...
...
@@ -40,7 +40,7 @@ def baseline_scaled_mm(a: torch.Tensor,
return
output
.
to
(
out_dtype
)
class
W8
A8Int8Config
(
QuantizationConfig
):
class
SlimQuantW4
A8Int8Config
(
QuantizationConfig
):
"""Config class for W8A8 Int8 Quantization.
- Weight: static, per-channel, symmetric
...
...
@@ -60,14 +60,14 @@ class W8A8Int8Config(QuantizationConfig):
@
classmethod
def
get_name
(
self
)
->
str
:
return
"
w8a8_int
8"
return
"
slimquant_w4a
8"
@
classmethod
def
get_config_filenames
(
cls
)
->
List
[
str
]:
return
[]
@
classmethod
def
from_config
(
cls
,
config
:
Dict
[
str
,
Any
])
->
"
W8
A8Int8Config"
:
def
from_config
(
cls
,
config
:
Dict
[
str
,
Any
])
->
"
SlimQuantW4
A8Int8Config"
:
return
cls
()
def
get_quant_method
(
...
...
@@ -77,18 +77,18 @@ class W8A8Int8Config(QuantizationConfig):
)
->
Optional
[
"QuantizeMethodBase"
]:
if
isinstance
(
layer
,
LinearBase
):
return
W8
A8Int8LinearMethod
(
self
)
return
SlimQuantW4
A8Int8LinearMethod
(
self
)
elif
isinstance
(
layer
,
FusedMoE
):
return
W8
A8Int8MoEMethod
(
self
)
return
SlimQuantW4
A8Int8MoEMethod
(
self
)
return
None
def
get_scaled_act_names
(
self
)
->
List
[
str
]:
return
[]
class
W8
A8Int8LinearMethod
(
LinearMethodBase
):
class
SlimQuantW4
A8Int8LinearMethod
(
LinearMethodBase
):
def
__init__
(
self
,
quantization_config
:
W8
A8Int8Config
):
def
__init__
(
self
,
quantization_config
:
SlimQuantW4
A8Int8Config
):
self
.
quantization_config
=
quantization_config
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
...
...
@@ -218,8 +218,8 @@ class W8A8Int8LinearMethod(LinearMethodBase):
bias
=
bias
)
class
W8
A8Int8MoEMethod
:
"""MoE method for INT8.
class
SlimQuantW4
A8Int8MoEMethod
:
"""MoE method for
W4A8
INT8.
Supports loading INT8 checkpoints with static weight scale and
dynamic/static activation scale.
Also supports loading quantized FP16/BF16 model checkpoints with dynamic
...
...
@@ -354,7 +354,7 @@ class W8A8Int8MoEMethod:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
if
enable_eplb
:
raise
NotImplementedError
(
"EPLB not supported for `
W8
A8Int8Mo
e
Method` yet."
)
"EPLB not supported for `
SlimQuantW4
A8Int8Mo
E
Method` yet."
)
# Expert selection
topk_weights
,
topk_ids
=
FusedMoE
.
select_experts
(
hidden_states
=
x
,
...
...
vllm/platforms/rocm.py
View file @
8b1e4ef0
...
...
@@ -180,7 +180,7 @@ class RocmPlatform(Platform):
supported_quantization
:
list
[
str
]
=
[
"awq"
,
"gptq"
,
"fp8"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
"gguf"
,
"quark"
,
"ptpc_fp8"
,
"moe_wna16"
,
"blockwise_int8"
,
"
w8a8_int
8"
,
"awq_marlin"
"quark"
,
"ptpc_fp8"
,
"moe_wna16"
,
"blockwise_int8"
,
"
slimquant_w4a
8"
,
"awq_marlin"
]
@
classmethod
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment