sync v0.15.1

c721b814 · zhuwenwen · d53fe7e5 · c721b814 · c721b814 · d53fe7e5
Commit c721b814 authored Feb 05, 2026 by zhuwenwen
20 changed files
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,7 +87,6 @@ if TYPE_CHECKING:
    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
    VLLM_PLUGINS: list[str] | None = None
    VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
-    VLLM_LORA_RESOLVER_HF_REPO_LIST: str | None = None
    # Deprecated env variables for profiling, kept for backward compatibility
    # See also vllm/config/profiler.py and `--profiler-config` argument
    VLLM_TORCH_CUDA_PROFILE: str | None = None
@@ -289,11 +288,16 @@ def use_aot_compile() -> bool:
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
+    from vllm.platforms import current_platform
    from vllm.utils.torch_utils import is_torch_equal_or_newer

    default_value = (
        "1"
-        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        if is_torch_equal_or_newer("2.10.0.dev")
+        and not disable_compile_cache()
+        # Disabling AOT_COMPILE for CPU
+        # See: https://github.com/vllm-project/vllm/issues/32033
+        and not current_platform.is_cpu()
        else "0"
    )

@@ -870,13 +874,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv(
        "VLLM_LORA_RESOLVER_CACHE_DIR", None
    ),
-    # A remote HF repo(s) containing one or more LoRA adapters, which
-    # may be downloaded and leveraged as needed. Only works if plugins
-    # are enabled and VLLM_ALLOW_RUNTIME_LORA_UPDATING is enabled.
-    # Values should be comma separated.
-    "VLLM_LORA_RESOLVER_HF_REPO_LIST": lambda: os.getenv(
-        "VLLM_LORA_RESOLVER_HF_REPO_LIST", None
-    ),
    # Enables torch CUDA profiling if set to 1.
    # Deprecated, see profiler_config.
    "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"),
@@ -884,7 +881,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Deprecated, see profiler_config.
    "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"),
    # Enable torch profiler to record shapes if set to 1.
-    # Deprecated, see profiler_config.
+    # Deprecated, see profiler_config.
    "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: (
        os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES")
    ),

--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from vllm.logging_utils.access_log_filter import (
-    UvicornAccessLogFilter,
-    create_uvicorn_log_config,
-)
 from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter
 from vllm.logging_utils.lazy import lazy
 from vllm.logging_utils.log_time import logtime
@@ -12,8 +8,6 @@ from vllm.logging_utils.log_time import logtime
 __all__ = [
    "NewLineFormatter",
    "ColoredFormatter",
-    "UvicornAccessLogFilter",
-    "create_uvicorn_log_config",
    "lazy",
    "logtime",
 ]
--- a/vllm/logging_utils/access_log_filter.py
+++ b/vllm/logging_utils/access_log_filter.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Access log filter for uvicorn to exclude specific endpoints from logging.
-
-This module provides a logging filter that can be used to suppress access logs
-for specific endpoints (e.g., /health, /metrics) to reduce log noise in
-production environments.
-"""
-
-import logging
-from urllib.parse import urlparse
-
-
-class UvicornAccessLogFilter(logging.Filter):
-    """
-    A logging filter that excludes access logs for specified endpoint paths.
-
-    This filter is designed to work with uvicorn's access logger. It checks
-    the log record's arguments for the request path and filters out records
-    matching the excluded paths.
-
-    Uvicorn access log format:
-        '%s - "%s %s HTTP/%s" %d'
-        (client_addr, method, path, http_version, status_code)
-
-    Example:
-        127.0.0.1:12345 - "GET /health HTTP/1.1" 200
-
-    Args:
-        excluded_paths: A list of URL paths to exclude from logging.
-                       Paths are matched exactly.
-                       Example: ["/health", "/metrics"]
-    """
-
-    def __init__(self, excluded_paths: list[str] | None = None):
-        super().__init__()
-        self.excluded_paths = set(excluded_paths or [])
-
-    def filter(self, record: logging.LogRecord) -> bool:
-        """
-        Determine if the log record should be logged.
-
-        Args:
-            record: The log record to evaluate.
-
-        Returns:
-            True if the record should be logged, False otherwise.
-        """
-        if not self.excluded_paths:
-            return True
-
-        # This filter is specific to uvicorn's access logs.
-        if record.name != "uvicorn.access":
-            return True
-
-        # The path is the 3rd argument in the log record's args tuple.
-        # See uvicorn's access logging implementation for details.
-        log_args = record.args
-        if isinstance(log_args, tuple) and len(log_args) >= 3:
-            path_with_query = log_args[2]
-            # Get path component without query string.
-            if isinstance(path_with_query, str):
-                path = urlparse(path_with_query).path
-                if path in self.excluded_paths:
-                    return False
-
-        return True
-
-
-def create_uvicorn_log_config(
-    excluded_paths: list[str] | None = None,
-    log_level: str = "info",
-) -> dict:
-    """
-    Create a uvicorn logging configuration with access log filtering.
-
-    This function generates a logging configuration dictionary that can be
-    passed to uvicorn's `log_config` parameter. It sets up the access log
-    filter to exclude specified paths.
-
-    Args:
-        excluded_paths: List of URL paths to exclude from access logs.
-        log_level: The log level for uvicorn loggers.
-
-    Returns:
-        A dictionary containing the logging configuration.
-
-    Example:
-        >>> config = create_uvicorn_log_config(["/health", "/metrics"])
-        >>> uvicorn.run(app, log_config=config)
-    """
-    config = {
-        "version": 1,
-        "disable_existing_loggers": False,
-        "filters": {
-            "access_log_filter": {
-                "()": UvicornAccessLogFilter,
-                "excluded_paths": excluded_paths or [],
-            },
-        },
-        "formatters": {
-            "default": {
-                "()": "uvicorn.logging.DefaultFormatter",
-                "fmt": "%(levelprefix)s %(message)s",
-                "use_colors": None,
-            },
-            "access": {
-                "()": "uvicorn.logging.AccessFormatter",
-                "fmt": '%(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s',  # noqa: E501
-            },
-        },
-        "handlers": {
-            "default": {
-                "formatter": "default",
-                "class": "logging.StreamHandler",
-                "stream": "ext://sys.stderr",
-            },
-            "access": {
-                "formatter": "access",
-                "class": "logging.StreamHandler",
-                "stream": "ext://sys.stdout",
-                "filters": ["access_log_filter"],
-            },
-        },
-        "loggers": {
-            "uvicorn": {
-                "handlers": ["default"],
-                "level": log_level.upper(),
-                "propagate": False,
-            },
-            "uvicorn.error": {
-                "level": log_level.upper(),
-                "handlers": ["default"],
-                "propagate": False,
-            },
-            "uvicorn.access": {
-                "handlers": ["access"],
-                "level": log_level.upper(),
-                "propagate": False,
-            },
-        },
-    }
-    return config
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -62,7 +62,6 @@ def _fused_moe_lora_kernel(
    num_experts,
    lora_ids,
    adapter_enabled,
-    max_loras,  # <<< PR2: rename, used for masks when grid axis-2 != max_loras
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
@@ -84,7 +83,6 @@ def _fused_moe_lora_kernel(
    num_slice_c: tl.constexpr,
    top_k: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
-    USE_B_L2_CACHE: tl.constexpr,  # new, enable .ca load for B
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
@@ -106,13 +104,10 @@ def _fused_moe_lora_kernel(
    if moe_enabled == 0:
        # Early exit for the no moe lora case.
        return
-    # The grid's axis-2 dimension is max_loras + 1 to accommodate the -1 sentinel.
-    # This guard ensures we don't access sorted_token_ids / expert_ids /
-    # num_tokens_post_padded beyond their allocated bounds if an invalid
-    # lora_id somehow appears. Although the caller should pass correct
-    # max_loras, defensive programming prevents accidental out-of-bounds.
-    if lora_id >= max_loras:
-        return
+    # The grid size on axis 2 is (max_loras + 1) to handle the no-lora case
+    # (lora_id == -1), but sorted_token_ids and expert_ids are allocated with
+    # shape (max_loras, ...). Use (num_programs - 1) for correct bounds checking.
+    max_loras = tl.num_programs(axis=2) - 1
    grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)

    # calculate pid_m,pid_n
@@ -141,11 +136,10 @@ def _fused_moe_lora_kernel(
    cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
    cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size

-    # remove modulo wrap-around
-    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int32)
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
    offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)

-    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int32)
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
    token_ind = stride_tl * lora_id + offs_token_id
    offs_token = tl.load(
        sorted_token_ids_ptr + token_ind,
@@ -183,11 +177,7 @@ def _fused_moe_lora_kernel(
        # before continuing.
        # pre-fetch lora weight
        # Mask only along K here; offs_bn is already wrapped modulo N above.
-        b_mask = (offs_k[:, None] < k_remaining) & (offs_bn[None, :] < N)
-        if USE_B_L2_CACHE:
-            b = tl.load(b_ptrs, mask=b_mask, other=0.0, cache_modifier=".ca")
-        else:
-            b = tl.load(b_ptrs, mask=b_mask, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0)

        if USE_GDC and not IS_PRIMARY:
            tl.extra.cuda.gdc_wait()
@@ -288,7 +278,6 @@ def _fused_moe_lora_shrink(
        num_experts,
        lora_ids,
        adapter_enabled,
-        lora_a_stacked[0].shape[0],
        qcurr_hidden_states.stride(0),
        qcurr_hidden_states.stride(1),
        w1_lora_a_stacked.stride(0),
@@ -305,7 +294,6 @@ def _fused_moe_lora_shrink(
        num_slice_c=num_slices,
        top_k=1 if mul_routed_weight else top_k_num,
        MUL_ROUTED_WEIGHT=False,
-        USE_B_L2_CACHE=True,  # new
        IS_PRIMARY=True,
        **shrink_config,
    )
@@ -391,7 +379,6 @@ def _fused_moe_lora_expand(
        num_experts,
        lora_ids,
        adapter_enabled,
-        lora_b_stacked[0].shape[0],
        a_intermediate_cache1.stride(0),
        a_intermediate_cache1.stride(1),
        w1_lora_b_stacked.stride(0),
@@ -408,7 +395,6 @@ def _fused_moe_lora_expand(
        num_slice_c=num_slices,
        top_k=1,
        MUL_ROUTED_WEIGHT=mul_routed_weight,
-        USE_B_L2_CACHE=True,  # new
        IS_PRIMARY=False,
        **expand_config,
    )

--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -1554,7 +1554,6 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
    ):
        assert isinstance(prefill, FlashInferPrefillMetadata)
-
        attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
            q=q,
            k=k,

--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -7,27 +7,18 @@ import torch
 from vllm.distributed import (
    get_ep_group,
 )
-from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
    FusedMoEQuantConfig,
 )
-from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import (
-    FlashInferA2APrepareAndFinalize,
-)
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEPrepareAndFinalize,
 )
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNaiveEP,
-    MoEPrepareAndFinalizeNoEP,
-)
+
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_ep, has_mori, has_pplx

-logger = init_logger(__name__)
-
 if current_platform.is_cuda_alike():
    if has_pplx():
        from .pplx_prepare_finalize import (
@@ -80,45 +71,19 @@ def maybe_make_prepare_finalize(
    moe: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig | None,
    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
-    allow_new_interface: bool = False,
 ) -> FusedMoEPrepareAndFinalize | None:
-    # NOTE(rob): we are migrating each quant_method to hold the MK
-    # in all cases. The allow_new_interface=False flag allow us to fall
-    # back to the old method for methods that have not yet been migrated.
-    #
-    # In old method:
-    #   * maybe_init_modular_kernel() calls this function. If we are
-    #     using no Dp/Ep or naive all2all, we return None this function
-    #     returns None and no ModularKernelMethod is created. If non-naive
-    #     all2all is used, this returns a PrepareAndFinalize object and
-    #     a ModularKernelMethod is created.
-    # In new method:
-    #   * maybe_make_prepare_finalize() is called from the oracle. We
-    #     always return a PrepareAndFinalize object and the quant method
-    #     holds the ModularKernel.
    if not moe.moe_parallel_config.use_all2all_kernels:
-        if not allow_new_interface:
-            return None
-
-        # For DP/TP case, fall back to naive P/F.
-        if moe.moe_parallel_config.dp_size > 1:
-            logger.info_once(
-                "Detected DP deployment with no --enable-expert-parallel. "
-                "Falling back to AllGather+ReduceScatter dispatch/combine."
-            )
-            return MoEPrepareAndFinalizeNaiveEP(
-                is_sequence_parallel=moe.moe_parallel_config.is_sequence_parallel,
-                num_dispatchers=(
-                    get_ep_group().device_communicator.all2all_manager.world_size
-                ),
-            )
-        else:
-            return MoEPrepareAndFinalizeNoEP()
+        return None

    all2all_manager = get_ep_group().device_communicator.all2all_manager
    assert all2all_manager is not None

    prepare_finalize: FusedMoEPrepareAndFinalize | None = None
+    
+    # TODO(rob): update this as part of the MoE refactor.
+    assert not moe.use_flashinfer_cutlass_kernels, (
+        "Must be created in modelopt.py or fp8.py"
+    )

    if moe.use_pplx_kernels:
        assert quant_config is not None
@@ -239,16 +204,4 @@ def maybe_make_prepare_finalize(
            use_fp8_dispatch=use_fp8_dispatch,
        )

-    elif moe.use_fi_all2allv_kernels:
-        assert quant_config is not None
-        prepare_finalize = FlashInferA2APrepareAndFinalize(
-            num_dispatchers=all2all_manager.world_size,
-        )
-
-    elif moe.use_naive_all2all_kernels and allow_new_interface:
-        prepare_finalize = MoEPrepareAndFinalizeNaiveEP(
-            is_sequence_parallel=(moe.moe_parallel_config.is_sequence_parallel),
-            num_dispatchers=all2all_manager.world_size,
-        )
-
    return prepare_finalize
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -20,6 +20,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 from vllm.utils.import_utils import has_triton_kernels
 from vllm.utils.math_utils import cdiv

@@ -861,7 +862,6 @@ class FusedMoEParallelConfig:

    use_ep: bool  # whether to use EP or not
    all2all_backend: str  # all2all backend for MoE communication
-    is_sequence_parallel: bool  # whether sequence parallelism is used
    enable_eplb: bool  # whether to enable expert load balancing

    @property
@@ -883,12 +883,6 @@ class FusedMoEParallelConfig:
    def use_deepep_ll_kernels(self):
        return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency"

-    @property
-    def use_fi_all2allv_kernels(self):
-        return (
-            self.use_all2all_kernels and self.all2all_backend == "flashinfer_all2allv"
-        )
-
    @property
    def use_batched_activation_format(self):
        return self.use_deepep_ll_kernels or self.use_pplx_kernels
@@ -1020,7 +1014,6 @@ class FusedMoEParallelConfig:
                ep_rank=0,
                use_ep=False,
                all2all_backend=vllm_parallel_config.all2all_backend,
-                is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe,
                enable_eplb=vllm_parallel_config.enable_eplb,
            )
        # DP + EP / TP + EP / DP + TP + EP
@@ -1040,7 +1033,6 @@ class FusedMoEParallelConfig:
            ep_rank=ep_rank,
            use_ep=True,
            all2all_backend=vllm_parallel_config.all2all_backend,
-            is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe,
            enable_eplb=vllm_parallel_config.enable_eplb,
        )

@@ -1059,7 +1051,6 @@ class FusedMoEParallelConfig:
            use_ep=False,
            all2all_backend="naive",
            enable_eplb=False,
-            is_sequence_parallel=False,
        )


@@ -1154,9 +1145,12 @@ class FusedMoEConfig:
        return self.moe_parallel_config.use_mori_kernels

    @property
-    def use_fi_all2allv_kernels(self):
-        return self.moe_parallel_config.use_fi_all2allv_kernels
-
-    @property
-    def use_naive_all2all_kernels(self):
-        return self.moe_parallel_config.use_naive_all2all_kernels
+    def use_flashinfer_cutlass_kernels(self):
+        """
+        Whether to use FlashInfer cutlass kernels for NVFP4 MoE.
+        """
+        return (
+            envs.VLLM_USE_FLASHINFER_MOE_FP4
+            and has_flashinfer_cutlass_fused_moe()
+            and envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput"
+        )
--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=128,device_name=BW200_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=BW200_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=DCU_K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=16,N=1024,device_name=K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 4,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=BW200.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 5
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "2": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "48": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "64": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "96": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "128": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "256": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "512": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=BW200_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=DCU_K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=BW200_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=DCU_K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=DCU_K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    }
-}
--- a/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=32,N=512,device_name=K100_AI_nn.json
-{
-    "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "4": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "16": {
-        "BLOCK_SIZE_M": 32,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "24": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 2,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "32": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "48": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "64": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "96": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "128": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 2,
-        "num_ldmatrixes": 1
-    },
-    "1024": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "1536": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "2048": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "3072": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    },
-    "4096": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 4,
-        "num_stages": 3,
-        "num_ldmatrixes": 1
-    }
-}