sync v0.15.1

c721b814 · zhuwenwen · d53fe7e5 · d53fe7e5 · d53fe7e5 · d53fe7e5
Commit c721b814 authored Feb 05, 2026 by zhuwenwen
20 changed files
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=BW3000.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=BW3000.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=DCU_K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "2": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "48": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 3
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 3
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 3
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW200_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=BW3000.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=K100_AI.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "2": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3
-    },
-    "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=K100_AI.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "32": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3
-    },
-    "96": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 3
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -104,13 +104,7 @@ def run_cutlass_moe_fp8(
    ), "Intermediate scale shape mismatch"
    assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
-    # NOTE(rob): the expert_map is used for the STANDARD case and
+    if expert_map is not None:
-    # the batched format is used by the BATCHED case.
-    # TODO(rob): update the MK interface to only pass the expert_map
-    # during the STANDARD case to make this clearer across all kernels.
-    if use_batched_format:
-        assert expert_num_tokens is not None
-    else:
        assert expert_num_tokens is None
    # We have two modes: batched experts and non-batched experts.
@@ -386,10 +380,7 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
        # needed for STANDARD activation format kernels in DP/EP mode.
        # Note that the BATCHED activation format does not use
        # the expert map for identifying experts.
-        return not (
+        return not moe_parallel_config.use_all2all_kernels
-            moe_parallel_config.use_fi_all2allv_kernels
-            or moe_parallel_config.use_deepep_ht_kernels
-        )
    def supports_chunking(self) -> bool:
        return True
@@ -651,8 +642,10 @@ def run_cutlass_moe_fp4(
 class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
-    @property
+    @staticmethod
-    def expects_unquantized_inputs(self) -> bool:
+    def expects_unquantized_inputs(
+        moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig
+    ) -> bool:
        return True
    @staticmethod

--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -148,8 +148,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
-        # NOTE(rob): discovered an IMA with this combination. Needs investigation.
+        return True
-        return not moe_parallel_config.use_fi_all2allv_kernels
    def supports_chunking(self) -> bool:
        return True

--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -103,7 +103,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        num_experts: int,
        a1_scale: torch.Tensor | None,
        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool,
    ) -> Callable:
        has_scales = token_scales is not None
@@ -175,7 +174,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
            expert_topk_weights,
            a1_scale,
            quant_config,
-            defer_input_quant=defer_input_quant,
        )
    def _receiver(
@@ -189,7 +187,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        expert_topk_weights: torch.Tensor | None,
        a1_scale: torch.Tensor | None,
        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool,
    ) -> mk.PrepareResultType:
        if event.event is not None:
            event.current_stream_wait()
@@ -224,15 +221,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
            expert_num_tokens_per_expert_list, device=expert_x.device
        )
-        # * For non-block quant, dispatch in b16 and quantize now as
+        # Dispatch and Quant
-        #   DeepEP kernels only support dispatching block scales.
+        # DeepEP kernels only support dispatching block-quantized
-        # * For expert kernels that require unquantized inputs,
+        # activation scales.
-        #   defer quantization to FusedMoEExpertsPermuteUnpermute.
+        # Dispatch in bfloat16 and quantize afterwards
-        if not quant_config.is_block_quantized and not defer_input_quant:
+        if not quant_config.is_block_quantized:
            # Quantize after dispatch.
            expert_x_scale = None
            if expert_x.numel() != 0:
-                # TODO: support per_act_token_quant,
                expert_x, expert_x_scale = moe_kernel_quantize_input(
                    expert_x,
                    a1_scale,
@@ -261,7 +257,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool = False,
    ) -> mk.ReceiverType:
        if apply_router_weight_on_input:
            topk = topk_ids.size(1)
@@ -271,12 +266,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
            )
            a1 = a1 * topk_weights.to(a1.dtype)
-        # * DeepEP only supports fp8 block scales so quantize
+        if quant_config.is_block_quantized:
-        #   before the dispatch for these models.
+            # Quant and Dispatch
-        # * For all other quantization, dispatch after.
-        # * For expert kernels that require unquantized inputs,
-        #   defer quantization to FusedMoEExpertsPermuteUnpermute.
-        if quant_config.is_block_quantized and not defer_input_quant:
            a1q, a1q_scale = moe_kernel_quantize_input(
                a1,
                quant_config.a1_scale,
@@ -290,11 +281,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        else:
            a1q = a1
            a1q_scale = None
-            a1_post_scale = (
+            a1_post_scale = quant_config.a1_scale
-                quant_config.a1_gscale
-                if quant_config.quant_dtype == "nvfp4"
-                else quant_config.a1_scale
-            )
        return self._do_dispatch(
            tokens=a1q,
@@ -304,7 +291,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
            num_experts=num_experts,
            a1_scale=a1_post_scale,
            quant_config=quant_config,
-            defer_input_quant=defer_input_quant,
        )
    def prepare(
@@ -316,7 +302,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool = False,
    ) -> mk.PrepareResultType:
        receiver = self.prepare_async(
            a1,
@@ -326,7 +311,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
            expert_map,
            apply_router_weight_on_input,
            quant_config,
-            defer_input_quant,
        )
        return receiver()

--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -242,13 +242,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool = False,
    ) -> tuple[Callable, mk.ReceiverType]:
-        if defer_input_quant:
-            raise NotImplementedError(
-                f"{self.__class__.__name__} does not support defer_input_quant=True. "
-                "Please select an MoE kernel that accepts quantized inputs."
-            )
        hidden_size = a1.size(1)
        assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, (
@@ -351,13 +345,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool = False,
    ) -> mk.PrepareResultType:
-        if defer_input_quant:
-            raise NotImplementedError(
-                f"{self.__class__.__name__} does not support defer_input_quant=True. "
-                "Please select an MoE kernel that accepts quantized inputs."
-            )
        hook, receiver = self.prepare_async(
            a1,
            topk_weights,

--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -78,9 +78,16 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
        # - skip input activation quantization (kernel applies scaling)
        self.use_deepseek_fp8_block_scale = quant_config.is_block_quantized
-    @property
+    @staticmethod
-    def expects_unquantized_inputs(self) -> bool:
+    def expects_unquantized_inputs(
-        return self.quant_config.use_fp8_w8a8 and self.quant_config.is_block_quantized
+        moe_config: mk.FusedMoEConfig, quant_config: FusedMoEQuantConfig
+    ) -> bool:
+        # NVFP4 TP kernels and FP8 block-quantized kernels apply
+        # input quantization inside FusedMoEPermuteExpertsUnpermute.
+        return (
+            quant_config.use_nvfp4_w4a4
+            and not moe_config.moe_parallel_config.use_all2all_kernels
+        ) or (quant_config.use_fp8_w8a8 and quant_config.is_block_quantized)
    @staticmethod
    def _supports_current_device() -> bool:
@@ -138,8 +145,10 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
        # FLASHINFER_CUTLASS currently uses its down P/F, which does not
        # work with SP. This will be removed in follow up after we get
        # rid of the FlashInfer specific P/F function.
-        # TODO: the per-tensor fp8 kernels don't work with MNNVL FI A2As.
+        return (
-        return not moe_parallel_config.is_sequence_parallel
+            moe_parallel_config.dp_size == 1
+            or moe_parallel_config.dp_size == moe_parallel_config.ep_size
+        )
    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
@@ -186,9 +195,9 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
        """
        workspace1 = (M, K)
        workspace2 = (0,)
-        # For NVFP4, the output is stored in a packed int8 format,
-        # so the actual hidden dim is 2x the size of K here.
+        # For TP, the quantization is fused with fused_moe call.
-        output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" else K)
+        output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" and self.use_dp else K)
        # The workspace is determined by `aq`, since it comes after any
        # potential communication op and is involved in the expert computation.
        return (workspace1, workspace2, output_shape)

--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm.distributed import get_dp_group, get_ep_group
-from vllm.distributed.device_communicators.base_device_communicator import (
-    All2AllManagerBase,
-)
-from vllm.forward_context import get_forward_context
-from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
-from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP,
-)
-from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
-from vllm.utils.flashinfer import nvfp4_block_scale_interleave
-def get_local_sizes():
-    return get_forward_context().dp_metadata.get_chunk_sizes_across_dp_rank()
-class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
-    """Base class for FlashInfer MoE prepare and finalize operations."""
-    def __init__(
-        self,
-        use_dp: bool,
-        num_dispatchers: int = 1,
-        use_deepseek_fp8_block_scale: bool = False,
-    ):
-        super().__init__()
-        self.num_dispatchers_ = num_dispatchers
-        self.use_dp = use_dp
-        self.local_tokens = None
-        # Toggle for DeepSeek-style FP8 block-scale path where activations are
-        # not quantized here and weight block scales are consumed by the kernel.
-        self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
-    @property
-    def activation_format(self) -> mk.FusedMoEActivationFormat:
-        return mk.FusedMoEActivationFormat.Standard
-    def max_num_tokens_per_rank(self) -> int | None:
-        return None
-    def topk_indices_dtype(self) -> torch.dtype | None:
-        return None
-    def num_dispatchers(self) -> int:
-        return self.num_dispatchers_
-    def output_is_reduced(self) -> bool:
-        return False
-    def _apply_router_weight_on_input(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-    ) -> None:
-        """Apply router weight on input if needed."""
-        if apply_router_weight_on_input:
-            topk = topk_ids.size(1)
-            assert topk == 1, (
-                "apply_router_weight_on_input is only implemented for topk=1"
-            )
-            a1.mul_(topk_weights.to(a1.dtype))
-class FlashInferAllToAllMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
-    """FlashInfer implementation using AllToAll communication."""
-    def __init__(
-        self,
-        use_dp: bool,
-        num_dispatchers: int = 1,
-        use_deepseek_fp8_block_scale: bool = False,
-    ):
-        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
-        self.alltoall_info = None
-        # Initialize all2all_manager only for DP case
-        self.all2all_manager = None
-        if self.use_dp:
-            self.all2all_manager = get_ep_group().device_communicator.all2all_manager
-    def prepare(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_experts: int,
-        expert_map: torch.Tensor | None,
-        apply_router_weight_on_input: bool,
-        quant_config: FusedMoEQuantConfig,
-    ) -> mk.PrepareResultType:
-        self._apply_router_weight_on_input(
-            a1, topk_weights, topk_ids, apply_router_weight_on_input
-        )
-        if not self.use_dp:
-            # Non-DP case: quantize activations unless using block-scale path
-            if not self.use_deepseek_fp8_block_scale:
-                a1q, a1q_scale = moe_kernel_quantize_input(
-                    a1,
-                    quant_config.a1_gscale,
-                    quant_config.quant_dtype,
-                    quant_config.per_act_token_quant,
-                    quant_config.block_shape,
-                    is_fp4_scale_swizzled=not self.use_dp,
-                )
-            else:
-                a1q = a1
-                a1q_scale = None
-        else:
-            # DP case: use FlashInfer AllToAll
-            global_num_tokens_cpu = get_local_sizes()
-            top_k = topk_ids.size(1)
-            (self.alltoall_info, topk_ids, topk_weights, a1q, a1q_scale) = (
-                flashinfer_alltoall_dispatch(
-                    self.all2all_manager,
-                    global_num_tokens_cpu,
-                    a1,
-                    quant_config.a1_gscale,
-                    topk_ids,
-                    topk_weights,
-                    top_k,
-                    num_experts,
-                    quant_config,
-                    use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
-                )
-            )
-        return a1q, a1q_scale, None, topk_ids, topk_weights
-    def finalize(
-        self,
-        output: torch.Tensor,
-        fused_expert_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-        weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
-        if self.use_dp:
-            top_k = topk_ids.size(1)
-            token_count = output.shape[0]
-            fused_expert_output = flashinfer_alltoall_combine(
-                self.all2all_manager,
-                fused_expert_output,
-                top_k=top_k,
-                token_count=token_count,
-                alltoall_info=self.alltoall_info,
-            )
-        output.copy_(fused_expert_output)
-class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
-    def __init__(
-        self,
-        use_dp: bool,
-        num_dispatchers: int = 1,
-        use_deepseek_fp8_block_scale: bool = False,
-    ):
-        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
-    def prepare(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_experts: int,
-        expert_map: torch.Tensor | None,
-        apply_router_weight_on_input: bool,
-        quant_config: FusedMoEQuantConfig,
-    ) -> mk.PrepareResultType:
-        self._apply_router_weight_on_input(
-            a1, topk_weights, topk_ids, apply_router_weight_on_input
-        )
-        is_nvfp4 = quant_config.quant_dtype == "nvfp4"
-        if not self.use_dp and is_nvfp4:
-            return a1, None, None, topk_ids, topk_weights
-        if not self.use_deepseek_fp8_block_scale:
-            a1q, a1q_scale = moe_kernel_quantize_input(
-                a1,
-                quant_config.a1_gscale if is_nvfp4 else quant_config.a1_scale,
-                quant_config.quant_dtype,
-                quant_config.per_act_token_quant,
-                quant_config.block_shape,
-                is_fp4_scale_swizzled=not self.use_dp,
-            )
-        else:
-            # Block-scale path: pass activations through, omit per-token scales
-            a1q = a1
-            a1q_scale = None
-        if self.use_dp:
-            # Build gather list conditionally - omit a1q_scale if None
-            # (block-scale path)
-            gather_list = [topk_weights, topk_ids, a1q]
-            if a1q_scale is not None:
-                gather_list.append(a1q_scale)
-                gathered = get_dp_group().all_gatherv(
-                    gather_list,
-                    dim=0,
-                    sizes=get_local_sizes(),
-                )
-                topk_weights, topk_ids, a1q, a1q_scale = gathered
-            else:
-                gathered = get_dp_group().all_gatherv(
-                    gather_list,
-                    dim=0,
-                    sizes=get_local_sizes(),
-                )
-                topk_weights, topk_ids, a1q = gathered
-                a1q_scale = None
-        if is_nvfp4 and a1q_scale is not None:
-            if a1q_scale.element_size() == 1:
-                a1q_scale = a1q_scale.view(torch.uint8)
-            a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
-        return a1q, a1q_scale, None, topk_ids, topk_weights
-    def finalize(
-        self,
-        output: torch.Tensor,
-        fused_expert_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-        weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
-        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP)
-        if self.use_dp:
-            fused_expert_output = get_dp_group().reduce_scatterv(
-                fused_expert_output, dim=0, sizes=get_local_sizes()
-            )
-        output.copy_(fused_expert_output)
-def flashinfer_alltoall_dispatch(
-    all2all_manager: All2AllManagerBase,
-    global_num_tokens_cpu: list[int],
-    x: torch.Tensor,
-    gs: torch.Tensor,
-    topk_ids: torch.Tensor,
-    topk_weights: torch.Tensor,
-    top_k: int,
-    num_experts: int,
-    quant_config: FusedMoEQuantConfig,
-    use_deepseek_fp8_block_scale: bool = False,
-):
-    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
-    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
-        "FlashInfer AllToAll workspace not available"
-    )
-    ep_rank = all2all_manager.rank
-    ep_size = all2all_manager.world_size
-    max_num_token = (
-        max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else x.shape[0]
-    )
-    orig_topk_weights_dtype = topk_weights.dtype
-    alltoall_info, topk_ids, topk_weights, _ = (
-        MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
-            topk_ids,
-            topk_weights,
-            None,
-            all2all_manager.prepare_workspace_tensor,
-            max_num_token,
-            ep_rank,
-            ep_size,
-            num_experts,
-            num_experts,
-            top_k,
-        )
-    )
-    topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)
-    if not use_deepseek_fp8_block_scale:
-        x, x_sf = moe_kernel_quantize_input(
-            x,
-            gs,
-            quant_config.quant_dtype,
-            quant_config.per_act_token_quant,
-            quant_config.block_shape,
-            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
-        )
-        x = MnnvlMoe.mnnvl_moe_alltoallv(
-            x,
-            alltoall_info,
-            all2all_manager.workspace_tensor,
-            ep_rank,
-            ep_size,
-        )
-        x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
-            x_sf,
-            alltoall_info,
-            all2all_manager.workspace_tensor,
-            ep_rank,
-            ep_size,
-        )
-        if quant_config.quant_dtype == "nvfp4":
-            x_sf = nvfp4_block_scale_interleave(x_sf)
-    else:
-        # Block-scale path: pass activations through without quantization
-        x_sf = None
-        x = MnnvlMoe.mnnvl_moe_alltoallv(
-            x,
-            alltoall_info,
-            all2all_manager.workspace_tensor,
-            ep_rank,
-            ep_size,
-        )
-    return alltoall_info, topk_ids, topk_weights, x, x_sf
-def flashinfer_alltoall_combine(
-    all2all_manager: All2AllManagerBase,
-    output: torch.Tensor,
-    top_k: int,
-    token_count: int,
-    alltoall_info,
-):
-    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
-    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
-        "FlashInfer AllToAll workspace not available"
-    )
-    return MnnvlMoe.mnnvl_moe_alltoallv_combine(
-        output,
-        alltoall_info,
-        all2all_manager.workspace_tensor,
-        ep_rank=all2all_manager.rank,
-        ep_size=all2all_manager.world_size,
-        top_k=top_k,
-        token_count=token_count,
-    )
-def create_flashinfer_prepare_finalize(
-    use_dp: bool,
-    use_nvfp4: bool = False,
-    enable_alltoallv: bool = False,
-    use_deepseek_fp8_block_scale: bool = False,
-) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP:
-    """Factory function to create the appropriate FlashInfer implementation."""
-    if use_dp:
-        if enable_alltoallv:
-            assert use_nvfp4
-            return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)
-        return FlashInferAllGatherMoEPrepareAndFinalize(
-            use_dp=True,
-            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
-        )
-    else:
-        # CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization
-        # in a single call with the MoE experts kernel.
-        defer_input_quant = use_deepseek_fp8_block_scale or use_nvfp4
-        return MoEPrepareAndFinalizeNoEP(defer_input_quant=defer_input_quant)
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -533,13 +533,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
-        defer_input_quant: bool = False,
    ) -> mk.PrepareResultType:
-        if defer_input_quant:
-            raise NotImplementedError(
-                f"{self.__class__.__name__} does not support defer_input_quant=True. "
-                "Please select an MoE kernel that accepts quantized inputs."
-            )
        assert a1.dim() == 2
        assert topk_ids.dim() == 2
        assert topk_ids.size(0) == a1.size(0)

--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -593,7 +593,7 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute):
    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
-        return not moe_parallel_config.use_fi_all2allv_kernels
+        return True
    @property
    def quant_type_id(self) -> int:

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -399,27 +399,14 @@ def fused_moe_kernel(
    # Map program ids `pid` to the block of C it should compute.
    # This is done in a grouped ordering to promote L2 data reuse.
    pid = tl.program_id(axis=0)
-    # num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
+    num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
-    # num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    # num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    # group_id = pid // num_pid_in_group
+    group_id = pid // num_pid_in_group
-    # first_pid_m = group_id * GROUP_SIZE_M
+    first_pid_m = group_id * GROUP_SIZE_M
-    # group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    # pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-    # pid_n = (pid % num_pid_in_group) // group_size_m
+    pid_n = (pid % num_pid_in_group) // group_size_m
-    if GROUP_SIZE_M ==1:
-        num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-        pid_m = pid // num_pid_n
-        pid_n = pid % num_pid_n
-    else:
-        num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M)
-        num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-        num_pid_in_group = GROUP_SIZE_M * num_pid_n
-        group_id = pid // num_pid_in_group
-        first_pid_m = group_id * GROUP_SIZE_M
-        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-        pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
-        pid_n = (pid % num_pid_in_group) // group_size_m
    # ----------------------------------------------------------
@@ -1967,7 +1954,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
-        return not moe_parallel_config.use_fi_all2allv_kernels
+        return True
    def supports_chunking(self) -> bool:
        return True

--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -5,7 +5,6 @@ from abc import abstractmethod
 import torch
-import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
@@ -26,120 +25,4 @@ class FusedMoEMethodBase(QuantizeMethodBase):
    def __init__(self, moe: FusedMoEConfig):
        super().__init__()
        self.moe: FusedMoEConfig = moe
        self.moe_quant_config: FusedMoEQuantConfig | None = None
-        self.moe_mk: mk.FusedMoEModularKernel | None = None
\ No newline at end of file
-    @property
-    def supports_internal_mk(self) -> bool:
-        # NOTE(rob): temporary attribute to indicate support for
-        # completed migration to the new internal MK interface.
-        return self.moe_mk is not None
-    @property
-    def mk_owns_shared_expert(self) -> bool:
-        # NOTE(rob): temporary attribute to indicate support for
-        # completed migration to the new internal MK interface.
-        return self.moe_mk is not None and self.moe_mk.shared_experts is not None
-    @abstractmethod
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        raise NotImplementedError
-    def uses_weight_scale_2_pattern(self) -> bool:
-        """
-        Returns True if this quantization method uses 'weight_scale_2' pattern
-        for per-tensor weight scales (e.g., FP4 variants), False otherwise.
-        This method should be overridden by subclasses that use the
-        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
-        """
-        return False
-    def maybe_make_prepare_finalize(
-        self,
-        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
-    ) -> FusedMoEPrepareAndFinalize | None:
-        from .all2all_utils import maybe_make_prepare_finalize
-        return maybe_make_prepare_finalize(
-            self.moe, self.moe_quant_config, routing_tables
-        )
-    def select_gemm_impl(
-        self,
-        prepare_finalize: FusedMoEPrepareAndFinalize,
-        layer: torch.nn.Module,
-    ) -> FusedMoEPermuteExpertsUnpermute:
-        # based on the all2all implementation, select the appropriate
-        # gemm implementation
-        raise NotImplementedError(
-            f"{self.__class__.__name__} must select appropriate gemm "
-            "implementation based on the prepare_finalize"
-        )
-    def prepare_dp_allgather_tensor(
-        self,
-        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
-        hidden_states: torch.Tensor,
-        router_logits: torch.Tensor,
-    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
-        """Hook to prepare tensors and extra tensors for DP allgather + EP dispatch."""
-        raise NotImplementedError(
-            "Method 'prepare_dp_allgather_tensor' is not implemented in "
-            f"{self.__class__.__name__}."
-        )
-    @abstractmethod
-    def get_fused_moe_quant_config(
-        self, layer: torch.nn.Module
-    ) -> FusedMoEQuantConfig | None:
-        raise NotImplementedError
-    @property
-    def topk_indices_dtype(self) -> torch.dtype | None:
-        if self.moe_mk is not None:
-            return self.moe_mk.prepare_finalize.topk_indices_dtype()
-        return None
-    @property
-    def supports_eplb(self) -> bool:
-        return False
-    @property
-    def allow_inplace(self) -> bool:
-        return False
-    @property
-    def method_name(self) -> str:
-        return self.__class__.__name__
-    @property
-    def is_monolithic(self) -> bool:
-        return False
-    # @abstractmethod
-    def apply(
-        self,
-        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
-        x: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        raise NotImplementedError
-    # @abstractmethod
-    def apply_monolithic(
-        self,
-        layer: "FusedMoE",  # type: ignore[name-defined] # noqa: F821
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        raise NotImplementedError
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -30,11 +30,11 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
    ):
        super().__init__(old_quant_method.moe)
        self.moe_quant_config = old_quant_method.moe_quant_config
-        self.moe_mk = experts
+        self.fused_experts = experts
        self.disable_expert_map = getattr(
            old_quant_method,
            "disable_expert_map",
-            not self.moe_mk.supports_expert_map(),
+            not self.fused_experts.supports_expert_map(),
        )
        self.old_quant_method = old_quant_method
        assert not self.old_quant_method.is_monolithic
@@ -57,6 +57,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
            ),
        )
+    @property
+    def topk_indices_dtype(self) -> torch.dtype | None:
+        return self.fused_experts.prepare_finalize.topk_indices_dtype()
    @property
    def supports_eplb(self) -> bool:
        return self.old_quant_method.supports_eplb
@@ -92,8 +96,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        assert self.moe_mk is not None
+        return self.fused_experts(
-        return self.moe_mk(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,