Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
54139f16
Commit
54139f16
authored
Aug 09, 2025
by
zhuwenwen
Browse files
修改增加lmslimquant_w4a8量化支持
parent
bdda5719
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
23 additions
and
17 deletions
+23
-17
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+4
-3
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/__init__.py
+3
-3
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+5
-0
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+10
-10
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+1
-1
No files found.
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
View file @
54139f16
...
...
@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum):
@
cache
def
is_rocm_aiter_moe_enabled
()
->
bool
:
return
current_platform
.
is_rocm
()
\
and
envs
.
VLLM_ROCM_USE_AITER_MOE
\
and
envs
.
VLLM_ROCM_USE_AITER
return
False
# return current_platform.is_rocm() \
# and envs.VLLM_ROCM_USE_AITER_MOE \
# and envs.VLLM_ROCM_USE_AITER
def
rocm_aiter_asm_moe_tkw1_impl
(
...
...
vllm/model_executor/layers/quantization/__init__.py
View file @
54139f16
...
...
@@ -38,7 +38,7 @@ QuantizationMethods = Literal[
"rtn"
,
"inc"
,
"blockwise_int8"
,
"
w8a8_int
8"
,
"
slimquant_w4a
8"
,
]
QUANTIZATION_METHODS
:
list
[
str
]
=
list
(
get_args
(
QuantizationMethods
))
...
...
@@ -119,7 +119,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
from
.torchao
import
TorchAOConfig
from
.tpu_int8
import
Int8TpuConfig
from
.blockwise_int8
import
BlockInt8Config
from
.
w8a8_int8
import
W8
A8Int8Config
from
.
slimquant_w4a8
import
SlimQuantW4
A8Int8Config
method_to_config
:
dict
[
str
,
type
[
QuantizationConfig
]]
=
{
"aqlm"
:
AQLMConfig
,
...
...
@@ -153,7 +153,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
"rtn"
:
RTNConfig
,
"inc"
:
INCConfig
,
"blockwise_int8"
:
BlockInt8Config
,
"
w8a8_int8"
:
W8
A8Int8Config
,
"
slimquant_w4a8"
:
SlimQuantW4
A8Int8Config
,
}
# Update the `method_to_config` with customized quantization methods.
method_to_config
.
update
(
_CUSTOMIZED_METHOD_TO_QUANT_CONFIG
)
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
54139f16
...
...
@@ -652,6 +652,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
apply_router_weight_on_input
:
bool
=
False
,
activation
:
str
=
"silu"
,
enable_eplb
:
bool
=
False
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
expert_load_view
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_to_physical_map
:
Optional
[
torch
.
Tensor
]
=
None
,
logical_replica_count
:
Optional
[
torch
.
Tensor
]
=
None
,
...
...
@@ -671,6 +674,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
num_expert_group
=
num_expert_group
,
custom_routing_function
=
custom_routing_function
,
scoring_func
=
scoring_func
,
routed_scaling_factor
=
routed_scaling_factor
,
use_fused_gate
=
use_fused_gate
,
e_score_correction_bias
=
e_score_correction_bias
,
indices_type
=
self
.
topk_indices_dtype
,
)
...
...
vllm/model_executor/layers/quantization/
w8a8_int
8.py
→
vllm/model_executor/layers/quantization/
slimquant_w4a
8.py
100755 → 100644
View file @
54139f16
...
...
@@ -40,7 +40,7 @@ def baseline_scaled_mm(a: torch.Tensor,
return
output
.
to
(
out_dtype
)
class
W8
A8Int8Config
(
QuantizationConfig
):
class
SlimQuantW4
A8Int8Config
(
QuantizationConfig
):
"""Config class for W8A8 Int8 Quantization.
- Weight: static, per-channel, symmetric
...
...
@@ -60,14 +60,14 @@ class W8A8Int8Config(QuantizationConfig):
@
classmethod
def
get_name
(
self
)
->
str
:
return
"
w8a8_int
8"
return
"
slimquant_w4a
8"
@
classmethod
def
get_config_filenames
(
cls
)
->
List
[
str
]:
return
[]
@
classmethod
def
from_config
(
cls
,
config
:
Dict
[
str
,
Any
])
->
"
W8
A8Int8Config"
:
def
from_config
(
cls
,
config
:
Dict
[
str
,
Any
])
->
"
SlimQuantW4
A8Int8Config"
:
return
cls
()
def
get_quant_method
(
...
...
@@ -77,18 +77,18 @@ class W8A8Int8Config(QuantizationConfig):
)
->
Optional
[
"QuantizeMethodBase"
]:
if
isinstance
(
layer
,
LinearBase
):
return
W8
A8Int8LinearMethod
(
self
)
return
SlimQuantW4
A8Int8LinearMethod
(
self
)
elif
isinstance
(
layer
,
FusedMoE
):
return
W8
A8Int8MoEMethod
(
self
)
return
SlimQuantW4
A8Int8MoEMethod
(
self
)
return
None
def
get_scaled_act_names
(
self
)
->
List
[
str
]:
return
[]
class
W8
A8Int8LinearMethod
(
LinearMethodBase
):
class
SlimQuantW4
A8Int8LinearMethod
(
LinearMethodBase
):
def
__init__
(
self
,
quantization_config
:
W8
A8Int8Config
):
def
__init__
(
self
,
quantization_config
:
SlimQuantW4
A8Int8Config
):
self
.
quantization_config
=
quantization_config
self
.
tritonsingleton
=
W8a8GetCacheJSON
()
self
.
w8a8_strategy
=
int
(
os
.
getenv
(
'W8A8_SUPPORT_METHODS'
,
'1'
))
...
...
@@ -218,8 +218,8 @@ class W8A8Int8LinearMethod(LinearMethodBase):
bias
=
bias
)
class
W8
A8Int8MoEMethod
:
"""MoE method for INT8.
class
SlimQuantW4
A8Int8MoEMethod
:
"""MoE method for
W4A8
INT8.
Supports loading INT8 checkpoints with static weight scale and
dynamic/static activation scale.
Also supports loading quantized FP16/BF16 model checkpoints with dynamic
...
...
@@ -355,7 +355,7 @@ class W8A8Int8MoEMethod:
if
enable_eplb
:
raise
NotImplementedError
(
"EPLB not supported for `
W8
A8Int8Mo
e
Method` yet."
)
"EPLB not supported for `
SlimQuantW4
A8Int8Mo
E
Method` yet."
)
# Expert selection
topk_weights
,
topk_ids
=
FusedMoE
.
select_experts
(
hidden_states
=
x
,
...
...
vllm/platforms/rocm.py
View file @
54139f16
...
...
@@ -181,7 +181,7 @@ class RocmPlatform(Platform):
supported_quantization
:
list
[
str
]
=
[
"awq"
,
"gptq"
,
"fp8"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
"gguf"
,
"quark"
,
"ptpc_fp8"
,
"moe_wna16"
,
"
blockwise_int
8"
,
"w8a8_int8"
,
"awq_marlin"
"quark"
,
"ptpc_fp8"
,
"moe_wna16"
,
"
slimquant_w4a
8"
,
"w8a8_int8"
,
"awq_marlin"
]
@
classmethod
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment