Commit e363e151 authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-step3v-int8' into 'v0.9.2-step3v'

add step3v_w8a16

See merge request dcutoolkit/deeplearing/vllm!257
parents 34bf6014 8223f750
This diff is collapsed.
...@@ -33,6 +33,7 @@ QuantizationMethods = Literal[ ...@@ -33,6 +33,7 @@ QuantizationMethods = Literal[
"ipex", "ipex",
"quark", "quark",
"moe_wna16", "moe_wna16",
"groupwise-quant",
"torchao", "torchao",
"auto-round", "auto-round",
"rtn", "rtn",
...@@ -120,6 +121,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: ...@@ -120,6 +121,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
from .blockwise_int8 import BlockInt8Config from .blockwise_int8 import BlockInt8Config
from .slimquant_w4a8 import SlimQuantW4A8Int8Config from .slimquant_w4a8 import SlimQuantW4A8Int8Config
from .slimquant_w4a8_marlin import SlimQuantW4A8Int8MarlinConfig from .slimquant_w4a8_marlin import SlimQuantW4A8Int8MarlinConfig
from .groupwise_quant import GroupwiseQuantConfig
method_to_config: dict[str, type[QuantizationConfig]] = { method_to_config: dict[str, type[QuantizationConfig]] = {
"aqlm": AQLMConfig, "aqlm": AQLMConfig,
...@@ -152,6 +154,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: ...@@ -152,6 +154,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
"auto-round": AutoRoundConfig, "auto-round": AutoRoundConfig,
"rtn": RTNConfig, "rtn": RTNConfig,
"blockwise_int8": BlockInt8Config, "blockwise_int8": BlockInt8Config,
"groupwise-quant": GroupwiseQuantConfig,
"slimquant_w4a8":SlimQuantW4A8Int8Config, "slimquant_w4a8":SlimQuantW4A8Int8Config,
"slimquant_w4a8_marlin":SlimQuantW4A8Int8MarlinConfig, "slimquant_w4a8_marlin":SlimQuantW4A8Int8MarlinConfig,
} }
......
...@@ -16,17 +16,14 @@ from vllm.utils import cuda_device_count_stateless ...@@ -16,17 +16,14 @@ from vllm.utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
from vllm.utils import is_kme, SUPPORT_TC from vllm.utils import SUPPORT_TC
if not SUPPORT_TC: if not SUPPORT_TC:
os.environ['VLLM_USE_V1'] = '0' os.environ['VLLM_USE_V1'] = '0'
os.environ['VLLM_USE_FLASH_ATTN_PA'] = '0' os.environ['VLLM_USE_FLASH_ATTN_PA'] = '0'
os.environ['VLLM_USE_FLASH_MLA'] = '0' os.environ['VLLM_USE_FLASH_MLA'] = '0'
if is_kme:
os.environ['VLLM_USE_FLASH_ATTN_PA'] = '0'
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
...@@ -190,7 +187,7 @@ class RocmPlatform(Platform): ...@@ -190,7 +187,7 @@ class RocmPlatform(Platform):
device_control_env_var: str = "CUDA_VISIBLE_DEVICES" device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
supported_quantization: list[str] = [ supported_quantization: list[str] = [
"awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf", "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf","groupwise-quant",
"quark", "ptpc_fp8", "moe_wna16", "blockwise_int8","slimquant_w4a8","awq_marlin","slimquant_w4a8_marlin" "quark", "ptpc_fp8", "moe_wna16", "blockwise_int8","slimquant_w4a8","awq_marlin","slimquant_w4a8_marlin"
] ]
...@@ -304,8 +301,6 @@ class RocmPlatform(Platform): ...@@ -304,8 +301,6 @@ class RocmPlatform(Platform):
logger.info("flash_attn is not supported on NAVI GPUs.") logger.info("flash_attn is not supported on NAVI GPUs.")
else: else:
logger.info("%s is not supported in AMD GPUs.", selected_backend) logger.info("%s is not supported in AMD GPUs.", selected_backend)
if is_kme:
os.environ['VLLM_USE_TRITON_FLASH_ATTN'] = '1'
logger.info("Using ROCmFlashAttention backend.") logger.info("Using ROCmFlashAttention backend.")
return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend" # noqa: E501 return "vllm.attention.backends.rocm_flash_attn.ROCmFlashAttentionBackend" # noqa: E501
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment