add SUPPORT_MOE_MARLIN_W16A16 to use moe marlin on bw

ad60a973 · zhuwenwen · f4cef40c · ad60a973 · ad60a973
Commit ad60a973 authored Jan 16, 2026 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 0 deletions

vllm/platforms/rocm.py vllm/platforms/rocm.py +7 -0

vllm/utils/__init__.py vllm/utils/__init__.py +4 -0

No files found.
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -16,6 +16,13 @@ from vllm.utils import cuda_device_count_stateless
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
+from vllm.utils import SUPPORT_MOE_MARLIN_W16A16
+# When vllm.utils reports W16A16 MoE Marlin support, force both switches into
+# the process environment at import time. Any value already present for these
+# keys is overwritten.
+# NOTE(review): presumably these are read later by the MoE kernel selection
+# code; the consumer is not visible in this change -- confirm.
+if SUPPORT_MOE_MARLIN_W16A16:
+    os.environ.update({'VLLM_USE_MARLIN_W16A16_MOE': '1', 'MOE_NN': '0'})
 if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -86,6 +86,10 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
+def _detect_gcn_arch() -> str:
+    """Return the ``gcnArchName`` of the default "cuda" device, or ``""``.
+
+    This runs while ``vllm.utils`` is being imported, so it must never raise:
+    a host with no visible GPU, or a torch build whose device properties do
+    not expose ``gcnArchName``, yields an empty string instead of breaking
+    the import.
+    """
+    try:
+        if not torch.cuda.is_available():
+            return ""
+        props = torch.cuda.get_device_properties("cuda")
+    except (RuntimeError, AssertionError):
+        # Driver/runtime initialisation failed or torch was built without
+        # GPU support; treat it as "no matching architecture".
+        return ""
+    return getattr(props, "gcnArchName", "") or ""
+# Architecture string of the default GPU; empty when it cannot be determined.
+GPU_ARCH = _detect_gcn_arch()
+# True only when the detected architecture string contains one of the listed
+# names (substring match, so suffixed variants of the name also match).
+SUPPORT_MOE_MARLIN_W16A16 = any(arch in GPU_ARCH for arch in ("gfx936",))
 # This value is chosen to have a balance between ITL and TTFT. Note it is
 # not optimized for throughput.
 DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048