Update MoE config file names

5ca1259e · zhuwenwen · db2c32b0 · 5ca1259e · db2c32b0 · 5ca1259e
Commit 5ca1259e authored Nov 03, 2025 by zhuwenwen
9 changed files
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -50,7 +50,9 @@ logger = init_logger(__name__)
 if envs.VLLM_USE_GLOBAL_CACHE13:
    moe_cache_singleton = None
+arch_name = torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0]
+    arch_cu = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count
 @torch.compile
 def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
    torch.sum(x, dim=1, out=out)
@@ -997,14 +999,14 @@ def get_config_file_name(E: int,
                         N: int,
                         dtype: Optional[str],
                         block_shape: Optional[List[int]] = None, use_nn_moe: Optional[bool] = False) -> str:
-    device_name = current_platform.get_device_name().replace(" ", "_")
+    # device_name = current_platform.get_device_name().replace(" ", "_")
    dtype_selector = "" if not dtype else f",dtype={dtype}"
    block_shape_selector = ("" if not block_shape or not all(block_shape) else
                            f",block_shape={block_shape}").replace(" ", "")
    if not use_nn_moe:
-        return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json"  # noqa: E501
+        return f"E={E},N={N},device_name={arch_name}_{arch_cu}cu{dtype_selector}{block_shape_selector}.json"
    else:
-        return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}_nn.json"
+        return f"E={E},N={N},device_name={arch_name}_{arch_cu}cu{dtype_selector}{block_shape_selector}_nn.json"
 # Adapted from: https://github.com/sgl-project/sglang/pull/2628
 @functools.lru_cache

--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -84,9 +84,9 @@ DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
 POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
-gpuname = torch.cuda.get_device_properties(torch.cuda.current_device()).name
+GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
-is_kme = gpuname.startswith('K100_AI') or gpuname.startswith('K500SM_AI')
+is_kme = any(arch in GPU_ARCH for arch in ["gfx928"])
-SUPPORT_TC = gpuname.startswith('K100_AI') or gpuname.startswith('K500SM_AI') or gpuname.startswith('BW')
+SUPPORT_TC = any(arch in GPU_ARCH for arch in ["gfx928", "gfx936"])
 def _generate_random_int8(
    tensor: torch.Tensor,
@@ -1956,9 +1956,10 @@ class W8a8GetCacheJSON:
        self.triton_json_list=[]
        self.weight_shapes=[]
        self.moe_weight_shapes=[]
-        device_name = current_platform.get_device_name().replace(" ", "_")
+        arch_name = torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0]
-        if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
+        arch_cu = torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count
-            device_name='K100_AI_120'
+        device_name =arch_name+'_'+str(arch_cu)+'cu'
        self.device_name=device_name
        self.topk=1
        self.quant_method=None