Force the use of the Triton GEMM backend on the gfx928 architecture.

f551bd1d · wanglong3 · a27f634a · f551bd1d · f551bd1d
Commit f551bd1d authored Feb 10, 2026 by wanglong3
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 3 deletions

vllm/envs.py vllm/envs.py +4 -1

vllm/model_executor/layers/quantization/slimquant_w4a8.py vllm/model_executor/layers/quantization/slimquant_w4a8.py +2 -2

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -6,6 +6,7 @@ import json
 import os
 import sys
 import tempfile
+import torch
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union

 if TYPE_CHECKING:
@@ -1704,7 +1705,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # cutlass: 2 (will remove in the future)
    # blaslt: 3 (default)
    # rocblas: others
-    "VLLM_W8A8_BACKEND": lambda: int(os.getenv("VLLM_W8A8_BACKEND", "3")),
+    "VLLM_W8A8_BACKEND": lambda: int(
+        1 if "gfx928" in torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] else os.getenv("VLLM_W8A8_BACKEND", "3")),
+
    # Force using Triton MoE path (disable Marlin W16A16 MoE).
    "VLLM_USE_MOE_W16A16_TRITON":
        lambda: (os.environ.get("VLLM_USE_MOE_W16A16_TRITON", "0").lower() in

--- a/vllm/model_executor/layers/quantization/slimquant_w4a8.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8.py
@@ -92,8 +92,8 @@ class SlimQuantW4A8Int8LinearMethod(LinearMethodBase):

    def __init__(self, quantization_config: SlimQuantW4A8Int8Config):
        self.quantization_config = quantization_config
-        self.tritonsingleton= W8a8GetCacheJSON()
-        self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
+        self.tritonsingleton = W8a8GetCacheJSON()
+        self.w8a8_strategy = envs.VLLM_W8A8_BACKEND

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        n=layer.weight.shape[0]