Update MoE config file names

28021d6e · zhuwenwen · 653b799b · 653b799b · 28021d6e · 28021d6e
Commit 28021d6e authored Oct 31, 2025 by zhuwenwen
8 changed files
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -59,6 +59,9 @@ logger = init_logger(__name__)
 if envs.VLLM_USE_GLOBAL_CACHE13:
    moe_cache_singleton = None
    
+arch_name = torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0]
+arch_cu = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count
+

 @torch.compile
 def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
@@ -1091,14 +1094,14 @@ def get_config_file_name(E: int,
                         N: int,
                         dtype: Optional[str],
                         block_shape: Optional[List[int]] = None, use_nn_moe: Optional[bool] = False) -> str:
-    device_name = current_platform.get_device_name().replace(" ", "_")
+    # device_name = current_platform.get_device_name().replace(" ", "_")
    dtype_selector = "" if not dtype else f",dtype={dtype}"
    block_shape_selector = ("" if not block_shape or not all(block_shape) else
                            f",block_shape={block_shape}").replace(" ", "")
    if not use_nn_moe:
-        return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"  # noqa: E501
+        return f"E={E},N={N},device_name={arch_name}_{arch_cu}cu{dtype_selector}{block_shape_selector}.json"
    else:
-        return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}_nn.json"
+        return f"E={E},N={N},device_name={arch_name}_{arch_cu}cu{dtype_selector}{block_shape_selector}_nn.json"

 # Adapted from: https://github.com/sgl-project/sglang/pull/2628
 @functools.lru_cache

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -92,8 +92,8 @@ DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
 POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120

-gpuname = torch.cuda.get_device_properties(torch.cuda.current_device()).name
-SUPPORT_TC = gpuname.startswith('K100_AI') or gpuname.startswith('K500SM_AI') or gpuname.startswith('BW')
+GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
+SUPPORT_TC = any(arch in GPU_ARCH for arch in ["gfx928", "gfx936"])

 # Constants related to forcing the attention backend selection