Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8223f750
Commit
8223f750
authored
Nov 16, 2025
by
luopl
Browse files
feat: implement int8 quantization
parent
34bf6014
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
2438 additions
and
387 deletions
+2438
-387
vllm/model_executor/layers/fused_moe/fused_moe_step3vw8a16.py
.../model_executor/layers/fused_moe/fused_moe_step3vw8a16.py
+2327
-0
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/__init__.py
+3
-0
vllm/model_executor/layers/quantization/groupwise_quant.py
vllm/model_executor/layers/quantization/groupwise_quant.py
+105
-379
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+3
-8
No files found.
vllm/model_executor/layers/fused_moe/fused_moe_step3vw8a16.py
0 → 100644
View file @
8223f750
This diff is collapsed.
Click to expand it.
vllm/model_executor/layers/quantization/__init__.py
View file @
8223f750
...
@@ -33,6 +33,7 @@ QuantizationMethods = Literal[
...
@@ -33,6 +33,7 @@ QuantizationMethods = Literal[
"ipex"
,
"ipex"
,
"quark"
,
"quark"
,
"moe_wna16"
,
"moe_wna16"
,
"groupwise-quant"
,
"torchao"
,
"torchao"
,
"auto-round"
,
"auto-round"
,
"rtn"
,
"rtn"
,
...
@@ -120,6 +121,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
...
@@ -120,6 +121,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
from
.blockwise_int8
import
BlockInt8Config
from
.blockwise_int8
import
BlockInt8Config
from
.slimquant_w4a8
import
SlimQuantW4A8Int8Config
from
.slimquant_w4a8
import
SlimQuantW4A8Int8Config
from
.slimquant_w4a8_marlin
import
SlimQuantW4A8Int8MarlinConfig
from
.slimquant_w4a8_marlin
import
SlimQuantW4A8Int8MarlinConfig
from
.groupwise_quant
import
GroupwiseQuantConfig
method_to_config
:
dict
[
str
,
type
[
QuantizationConfig
]]
=
{
method_to_config
:
dict
[
str
,
type
[
QuantizationConfig
]]
=
{
"aqlm"
:
AQLMConfig
,
"aqlm"
:
AQLMConfig
,
...
@@ -152,6 +154,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
...
@@ -152,6 +154,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
"auto-round"
:
AutoRoundConfig
,
"auto-round"
:
AutoRoundConfig
,
"rtn"
:
RTNConfig
,
"rtn"
:
RTNConfig
,
"blockwise_int8"
:
BlockInt8Config
,
"blockwise_int8"
:
BlockInt8Config
,
"groupwise-quant"
:
GroupwiseQuantConfig
,
"slimquant_w4a8"
:
SlimQuantW4A8Int8Config
,
"slimquant_w4a8"
:
SlimQuantW4A8Int8Config
,
"slimquant_w4a8_marlin"
:
SlimQuantW4A8Int8MarlinConfig
,
"slimquant_w4a8_marlin"
:
SlimQuantW4A8Int8MarlinConfig
,
}
}
...
...
vllm/model_executor/layers/quantization/groupwise_quant.py
View file @
8223f750
This diff is collapsed.
Click to expand it.
vllm/platforms/rocm.py
View file @
8223f750
...
@@ -16,17 +16,14 @@ from vllm.utils import cuda_device_count_stateless
...
@@ -16,17 +16,14 @@ from vllm.utils import cuda_device_count_stateless
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
.interface
import
DeviceCapability
,
Platform
,
PlatformEnum
,
_Backend
from
vllm.utils
import
is_kme
,
SUPPORT_TC
from
vllm.utils
import
SUPPORT_TC
if
not
SUPPORT_TC
:
if
not
SUPPORT_TC
:
os
.
environ
[
'VLLM_USE_V1'
]
=
'0'
os
.
environ
[
'VLLM_USE_V1'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_MLA'
]
=
'0'
os
.
environ
[
'VLLM_USE_FLASH_MLA'
]
=
'0'
if
is_kme
:
os
.
environ
[
'VLLM_USE_FLASH_ATTN_PA'
]
=
'0'
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
VllmConfig
...
@@ -190,7 +187,7 @@ class RocmPlatform(Platform):
...
@@ -190,7 +187,7 @@ class RocmPlatform(Platform):
device_control_env_var
:
str
=
"CUDA_VISIBLE_DEVICES"
device_control_env_var
:
str
=
"CUDA_VISIBLE_DEVICES"
supported_quantization
:
list
[
str
]
=
[
supported_quantization
:
list
[
str
]
=
[
"awq"
,
"gptq"
,
"fp8"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
"gguf"
,
"awq"
,
"gptq"
,
"fp8"
,
"compressed-tensors"
,
"fbgemm_fp8"
,
"gguf"
,
"groupwise-quant"
,
"quark"
,
"ptpc_fp8"
,
"moe_wna16"
,
"blockwise_int8"
,
"slimquant_w4a8"
,
"awq_marlin"
,
"slimquant_w4a8_marlin"
"quark"
,
"ptpc_fp8"
,
"moe_wna16"
,
"blockwise_int8"
,
"slimquant_w4a8"
,
"awq_marlin"
,
"slimquant_w4a8_marlin"
]
]
...
@@ -304,8 +301,6 @@ class RocmPlatform(Platform):
...
@@ -304,8 +301,6 @@ class RocmPlatform(Platform):
logger
.
info
(
"flash_attn is not supported on NAVI GPUs."
)
logger
.
info
(
"flash_attn is not supported on NAVI GPUs."
)
else
:
else
:
logger
.
info
(
"%s is not supported in AMD GPUs."
,
selected_backend
)
logger
.
info
(
"%s is not supported in AMD GPUs."
,
selected_backend
)
if
is_kme
:
os
.
environ
[
'VLLM_USE_TRITON_FLASH_ATTN'
]
=
'1'
logger
.
info
(
"Using ROCmFlashAttention backend."
)
logger
.
info
(
"Using ROCmFlashAttention backend."
)
return
"vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"
# noqa: E501
return
"vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend"
# noqa: E501
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment