remove SUPPORT_MOE_MARLIN_W16A16

6216b12d · zhuwenwen · 1a26d0b6 · 6216b12d · 6216b12d
Commit 6216b12d authored Jan 19, 2026 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 0 additions and 10 deletions

vllm/platforms/rocm.py vllm/platforms/rocm.py +0 -6

vllm/utils/__init__.py vllm/utils/__init__.py +0 -4

No files found.
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
-from vllm.utils import SUPPORT_MOE_MARLIN_W16A16
-if SUPPORT_MOE_MARLIN_W16A16:
-    os.environ['VLLM_USE_MARLIN_W16A16_MOE'] = '1'
-    os.environ['MOE_NN'] = '0'
 if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -86,10 +86,6 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
-GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
-SUPPORT_MOE_MARLIN_W16A16 = any(arch in GPU_ARCH for arch in ["gfx936"])
 # This value is chosen to have a balance between ITL and TTFT. Note it is
 # not optimized for throughput.
 DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048