sync v0.18.0

eefa41c1 · zhuwenwen · 82155c76 · eefa41c1 · eefa41c1 · eefa41c1
Commit eefa41c1 authored Mar 24, 2026 by zhuwenwen
20 changed files
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -74,6 +74,7 @@ class BaseFrontendArgs:
    like SSL, CORS, and HTTP server settings. Those arguments are added by
    the subclasses.
    """
    lora_modules: list[LoRAModulePath] | None = None
    """LoRA modules configurations in either 'name=path' format or JSON format
    or JSON list format. Example (old format): `'name=path'` Example (new

--- a/vllm/entrypoints/openai/engine/protocol.py
+++ b/vllm/entrypoints/openai/engine/protocol.py
@@ -216,6 +216,10 @@ def get_logits_processors(
 class FunctionCall(OpenAIBaseModel):
+    # Internal field to preserve native tool call ID from tool parser.
+    # Excluded from serialization to maintain OpenAI API compatibility
+    # (function object should only contain 'name' and 'arguments').
+    id: str | None = Field(default=None, exclude=True)
    name: str
    arguments: str
@@ -263,4 +267,4 @@ class GenerationError(Exception):
    def __init__(self, message: str = "Internal server error"):
        super().__init__(message)
        self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
\ No newline at end of file
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -282,7 +282,6 @@ def use_aot_compile() -> bool:
    from vllm.model_executor.layers.batch_invariant import (
        vllm_is_batch_invariant,
    )
-    from vllm.platforms import current_platform
    from vllm.utils.torch_utils import is_torch_equal_or_newer
    default_value = (

--- a/vllm/logging_utils/__init__.py
+++ b/vllm/logging_utils/__init__.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.logging_utils.access_log_filter import (
+    UvicornAccessLogFilter,
+    create_uvicorn_log_config,
+)
 from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter
 from vllm.logging_utils.lazy import lazy
 from vllm.logging_utils.log_time import logtime
@@ -8,6 +12,8 @@ from vllm.logging_utils.log_time import logtime
 __all__ = [
    "NewLineFormatter",
    "ColoredFormatter",
+    "UvicornAccessLogFilter",
+    "create_uvicorn_log_config",
    "lazy",
    "logtime",
 ]
\ No newline at end of file
--- a/vllm/logging_utils/access_log_filter.py
+++ b/vllm/logging_utils/access_log_filter.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Access log filter for uvicorn to exclude specific endpoints from logging.
+This module provides a logging filter that can be used to suppress access logs
+for specific endpoints (e.g., /health, /metrics) to reduce log noise in
+production environments.
+"""
+import logging
+from urllib.parse import urlparse
+class UvicornAccessLogFilter(logging.Filter):
+    """
+    A logging filter that excludes access logs for specified endpoint paths.
+    This filter is designed to work with uvicorn's access logger. It checks
+    the log record's arguments for the request path and filters out records
+    matching the excluded paths.
+    Uvicorn access log format:
+        '%s - "%s %s HTTP/%s" %d'
+        (client_addr, method, path, http_version, status_code)
+    Example:
+        127.0.0.1:12345 - "GET /health HTTP/1.1" 200
+    Args:
+        excluded_paths: A list of URL paths to exclude from logging.
+                       Paths are matched exactly.
+                       Example: ["/health", "/metrics"]
+    """
+    def __init__(self, excluded_paths: list[str] | None = None):
+        super().__init__()
+        self.excluded_paths = set(excluded_paths or [])
+    def filter(self, record: logging.LogRecord) -> bool:
+        """
+        Determine if the log record should be logged.
+        Args:
+            record: The log record to evaluate.
+        Returns:
+            True if the record should be logged, False otherwise.
+        """
+        if not self.excluded_paths:
+            return True
+        # This filter is specific to uvicorn's access logs.
+        if record.name != "uvicorn.access":
+            return True
+        # The path is the 3rd argument in the log record's args tuple.
+        # See uvicorn's access logging implementation for details.
+        log_args = record.args
+        if isinstance(log_args, tuple) and len(log_args) >= 3:
+            path_with_query = log_args[2]
+            # Get path component without query string.
+            if isinstance(path_with_query, str):
+                path = urlparse(path_with_query).path
+                if path in self.excluded_paths:
+                    return False
+        return True
+def create_uvicorn_log_config(
+    excluded_paths: list[str] | None = None,
+    log_level: str = "info",
+) -> dict:
+    """
+    Create a uvicorn logging configuration with access log filtering.
+    This function generates a logging configuration dictionary that can be
+    passed to uvicorn's `log_config` parameter. It sets up the access log
+    filter to exclude specified paths.
+    Args:
+        excluded_paths: List of URL paths to exclude from access logs.
+        log_level: The log level for uvicorn loggers.
+    Returns:
+        A dictionary containing the logging configuration.
+    Example:
+        >>> config = create_uvicorn_log_config(["/health", "/metrics"])
+        >>> uvicorn.run(app, log_config=config)
+    """
+    config = {
+        "version": 1,
+        "disable_existing_loggers": False,
+        "filters": {
+            "access_log_filter": {
+                "()": UvicornAccessLogFilter,
+                "excluded_paths": excluded_paths or [],
+            },
+        },
+        "formatters": {
+            "default": {
+                "()": "uvicorn.logging.DefaultFormatter",
+                "fmt": "%(levelprefix)s %(message)s",
+                "use_colors": None,
+            },
+            "access": {
+                "()": "uvicorn.logging.AccessFormatter",
+                "fmt": '%(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s',  # noqa: E501
+            },
+        },
+        "handlers": {
+            "default": {
+                "formatter": "default",
+                "class": "logging.StreamHandler",
+                "stream": "ext://sys.stderr",
+            },
+            "access": {
+                "formatter": "access",
+                "class": "logging.StreamHandler",
+                "stream": "ext://sys.stdout",
+                "filters": ["access_log_filter"],
+            },
+        },
+        "loggers": {
+            "uvicorn": {
+                "handlers": ["default"],
+                "level": log_level.upper(),
+                "propagate": False,
+            },
+            "uvicorn.error": {
+                "level": log_level.upper(),
+                "handlers": ["default"],
+                "propagate": False,
+            },
+            "uvicorn.access": {
+                "handlers": ["access"],
+                "level": log_level.upper(),
+                "propagate": False,
+            },
+        },
+    }
+    return config
\ No newline at end of file
--- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py
@@ -175,6 +175,7 @@ def _fused_moe_lora_kernel(
    top_k_num,
    lora_ids,
    adapter_enabled,
+    max_loras,  # <<< PR2: rename, used for masks when grid axis-2 != max_loras
    # The stride variables represent how much to increase the ptr by when
    # moving by 1 element in a particular dimension. E.g. `stride_am` is
    # how much to increase `a_ptr` by to get the element one row down
@@ -225,7 +226,6 @@ def _fused_moe_lora_kernel(
 ):
    pid = tl.program_id(axis=0)
    slice_id = tl.program_id(axis=1)
    grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)
    # calculate pid_m,pid_n
@@ -296,7 +296,6 @@ def _fused_moe_lora_kernel(
    cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
    cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size
    offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
    token_mask = offs_token < num_valid_tokens
@@ -504,6 +503,7 @@ def _fused_moe_lora_shrink(
        top_k_num,
        lora_ids,
        adapter_enabled,
+        lora_a_stacked[0].shape[0],
        qcurr_hidden_states.stride(0),
        qcurr_hidden_states.stride(1),
        w1_lora_a_stacked.stride(0),
@@ -639,6 +639,7 @@ def _fused_moe_lora_expand(
        top_k_num,
        lora_ids,
        adapter_enabled,
+        lora_b_stacked[0].shape[0],
        a_intermediate_cache1.stride(0),
        a_intermediate_cache1.stride(1),
        w1_lora_b_stacked.stride(0),

--- a/vllm/model_executor/kernels/linear/__init__.py
+++ b/vllm/model_executor/kernels/linear/__init__.py
@@ -82,7 +82,6 @@ from vllm.model_executor.kernels.linear.scaled_mm.rocm import (
 from vllm.model_executor.kernels.linear.scaled_mm.triton import (
    TritonInt8ScaledMMLinearKernel,
 )
 from vllm.model_executor.kernels.linear.scaled_mm.xpu import (
    XPUFP8ScaledMMLinearKernel,
 )

--- a/vllm/model_executor/kernels/linear/scaled_mm/rocm.py
+++ b/vllm/model_executor/kernels/linear/scaled_mm/rocm.py
@@ -29,7 +29,6 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl(
        and B.shape[0] % 16 == 0  # M TODO: needed?
        and B.shape[1] % 16 == 0  # K
        and ((bias is None) or (bias.dtype == out_dtype))
-        and A.is_contiguous()
    ):
        output = ops.wvSplitKQ(
            B.t(),
@@ -116,4 +115,4 @@ class ROCmFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
        output = torch.ops.vllm.rocm_per_tensor_float_w8a8_scaled_mm_impl(
            A, B, out_dtype, As, Bs, bias
        )
        return torch.narrow(output, 0, 0, A.shape[0]).view(*output_shape)
\ No newline at end of file
--- a/vllm/model_executor/kernels/linear/xpu.py
+++ b/vllm/model_executor/kernels/linear/xpu.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Sequence
-import torch
-from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import (  # noqa: E501
-    FP8ScaledMMLinearKernel,
-    FP8ScaledMMLinearLayerConfig,
-)
-from vllm.platforms import current_platform
-class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
-    @classmethod
-    def is_supported(
-        cls, compute_capability: int | None = None
-    ) -> tuple[bool, str | None]:
-        if not current_platform.is_xpu():
-            return False, "XPUFP8ScaledMM only support on XPU"
-        return True, None
-    @classmethod
-    def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
-        if c.weight_quant_key.dtype not in {torch.float8_e5m2, torch.float8_e4m3fn}:
-            return False, "XPUFP8ScaledMM only support FP8 weight dtype"
-        return True, None
-    def __init__(
-        self, c: FP8ScaledMMLinearLayerConfig, layer_param_names: Sequence[str]
-    ) -> None:
-        assert self.can_implement(c)[0]
-        assert self.is_supported()[0]
-        self.config = c
-        self.layer_param_names = layer_param_names
-    def apply_weights(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        weight = layer.weight
-        weight_scale = layer.weight_scale
-        return torch.ops._xpu_C.fp8_gemm_w8a16(x, weight, weight_scale, bias)
-    def apply_scaled_mm(
-        self,
-        *,
-        A: torch.Tensor,
-        B: torch.Tensor,
-        out_dtype: torch.dtype,
-        As: torch.Tensor,
-        Bs: torch.Tensor,
-        bias: torch.Tensor | None,
-        output_shape: list,
-    ) -> torch.Tensor:
-        pass
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -2291,12 +2291,14 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
        self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
    ):
        assert isinstance(prefill, FlashInferPrefillMetadata)
        attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
            q=q,
            k=k,
            v=v,
            return_lse=True,
        )
        # Convert from (q_len, num_heads) to (num_heads, q_len)
        return attn_out, lse.transpose(0, 1).contiguous()
@@ -2730,4 +2732,4 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
        attn_metadata: M,
        layer: AttentionLayer,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        raise NotImplementedError
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/all2all_utils.py
+++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py
@@ -9,6 +9,7 @@ from vllm.config import get_current_vllm_config
 from vllm.distributed import (
    get_ep_group,
 )
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEParallelConfig,
@@ -30,6 +31,8 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep
+logger = init_logger(__name__)
 if current_platform.is_cuda_alike():
    if has_deep_ep():
        from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
@@ -90,6 +93,20 @@ def maybe_make_prepare_finalize(
    allow_new_interface: bool = False,
    use_monolithic: bool = False,
 ) -> FusedMoEPrepareAndFinalize | None:
+    # NOTE(rob): we are migrating each quant_method to hold the MK
+    # in all cases. The allow_new_interface=False flag allow us to fall
+    # back to the old method for methods that have not yet been migrated.
+    #
+    # In old method:
+    #   * maybe_init_modular_kernel() calls this function. If we are
+    #     using no Dp/Ep or naive all2all, we return None this function
+    #     returns None and no ModularKernelMethod is created. If non-naive
+    #     all2all is used, this returns a PrepareAndFinalize object and
+    #     a ModularKernelMethod is created.
+    # In new method:
+    #   * maybe_make_prepare_finalize() is called from the oracle. We
+    #     always return a PrepareAndFinalize object and the quant method
+    #     holds the ModularKernel.
    if not moe.moe_parallel_config.use_all2all_kernels:
        if not allow_new_interface:
            return None
@@ -114,11 +131,6 @@ def maybe_make_prepare_finalize(
    assert all2all_manager is not None
    prepare_finalize: FusedMoEPrepareAndFinalize | None = None
-    # TODO(rob): update this as part of the MoE refactor.
-    assert not moe.use_flashinfer_cutlass_kernels, (
-        "Must be created in modelopt.py or fp8.py"
-    )
    if moe.use_deepep_ht_kernels:
        assert moe.dp_size == all2all_manager.dp_world_size
@@ -258,4 +270,5 @@ def maybe_make_prepare_finalize(
            physical_to_global=physical_to_global,
            local_expert_global_ids=local_expert_global_ids,
        )
-    return prepare_finalize
+    return prepare_finalize
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -17,7 +17,6 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
 )
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
-from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 from vllm.utils.import_utils import has_triton_kernels
 from vllm.utils.math_utils import cdiv
@@ -1262,4 +1261,4 @@ class FusedMoEConfig:
    @property
    def use_nixl_ep_kernels(self):
        return self.moe_parallel_config.use_nixl_ep_kernels
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -107,7 +107,13 @@ def run_cutlass_moe_fp8(
    ), "Intermediate scale shape mismatch"
    assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
-    if expert_map is not None:
+    # NOTE(rob): the expert_map is used for the STANDARD case and
+    # the batched format is used by the BATCHED case.
+    # TODO(rob): update the MK interface to only pass the expert_map
+    # during the STANDARD case to make this clearer across all kernels.
+    if use_batched_format:
+        assert expert_num_tokens is not None
+    else:
        assert expert_num_tokens is None
    # We have two modes: batched experts and non-batched experts.
@@ -1188,4 +1194,4 @@ def cutlass_moe_w4a8_fp8(
        global_num_experts=num_experts,
        expert_map=expert_map,
        apply_router_weight_on_input=apply_router_weight_on_input,
    )
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -103,6 +103,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        num_experts: int,
        a1_scale: torch.Tensor | None,
        quant_config: FusedMoEQuantConfig,
+        defer_input_quant: bool,
    ) -> Callable:
        has_scales = token_scales is not None
@@ -174,6 +175,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
            expert_topk_weights,
            a1_scale,
            quant_config,
+            defer_input_quant=defer_input_quant,
        )
    def _receiver(
@@ -187,6 +189,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        expert_topk_weights: torch.Tensor | None,
        a1_scale: torch.Tensor | None,
        quant_config: FusedMoEQuantConfig,
+        defer_input_quant: bool,
    ) -> mk.PrepareResultType:
        if event.event is not None:
            event.current_stream_wait()
@@ -221,14 +224,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
            expert_num_tokens_per_expert_list, device=expert_x.device
        )
-        # Dispatch and Quant
+        # * For non-block quant, dispatch in b16 and quantize now as
-        # DeepEP kernels only support dispatching block-quantized
+        #   DeepEP kernels only support dispatching block scales.
-        # activation scales.
+        # * For expert kernels that require unquantized inputs,
-        # Dispatch in bfloat16 and quantize afterwards
+        #   defer quantization to FusedMoEExpertsPermuteUnpermute.
-        if not quant_config.is_block_quantized:
+        if not quant_config.is_block_quantized and not defer_input_quant:
            # Quantize after dispatch.
            expert_x_scale = None
            if expert_x.numel() != 0:
+                # TODO: support per_act_token_quant,
                expert_x, expert_x_scale = moe_kernel_quantize_input(
                    expert_x,
                    a1_scale,
@@ -258,6 +262,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
+        defer_input_quant: bool = False,
    ) -> mk.ReceiverType:
        if apply_router_weight_on_input:
            topk = topk_ids.size(1)
@@ -267,8 +272,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
            )
            a1 = a1 * topk_weights.to(a1.dtype)
-        if quant_config.is_block_quantized:
+        # * DeepEP only supports fp8 block scales so quantize
-            # Quant and Dispatch
+        #   before the dispatch for these models.
+        # * For all other quantization, dispatch after.
+        # * For expert kernels that require unquantized inputs,
+        #   defer quantization to FusedMoEExpertsPermuteUnpermute.
+        if quant_config.is_block_quantized and not defer_input_quant:
            a1q, a1q_scale = moe_kernel_quantize_input(
                a1,
                quant_config.a1_scale,
@@ -282,7 +291,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        else:
            a1q = a1
            a1q_scale = None
-            a1_post_scale = quant_config.a1_scale
+            a1_post_scale = (
+                quant_config.a1_gscale
+                if quant_config.quant_dtype == "nvfp4"
+                else quant_config.a1_scale
+            )
        return self._do_dispatch(
            tokens=a1q,
@@ -292,6 +305,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
            num_experts=num_experts,
            a1_scale=a1_post_scale,
            quant_config=quant_config,
+            defer_input_quant=defer_input_quant,
        )
    def prepare(
@@ -303,6 +317,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
+        defer_input_quant: bool = False,
    ) -> mk.PrepareResultType:
        receiver = self.prepare_async(
            a1,
@@ -312,6 +327,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
            expert_map,
            apply_router_weight_on_input,
            quant_config,
+            defer_input_quant,
        )
        return receiver()
@@ -418,4 +434,4 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
            apply_router_weight_on_input,
            weight_and_reduce_impl,
            False,
        )
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -241,7 +241,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
+        defer_input_quant: bool = False,
    ) -> tuple[Callable, mk.ReceiverType]:
+        if defer_input_quant:
+            raise NotImplementedError(
+                f"{self.__class__.__name__} does not support defer_input_quant=True. "
+                "Please select an MoE kernel that accepts quantized inputs."
+            )
        hidden_size = a1.size(1)
        assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, (
@@ -344,7 +350,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
+        defer_input_quant: bool = False,
    ) -> mk.PrepareResultType:
+        if defer_input_quant:
+            raise NotImplementedError(
+                f"{self.__class__.__name__} does not support defer_input_quant=True. "
+                "Please select an MoE kernel that accepts quantized inputs."
+            )
        hook, receiver = self.prepare_async(
            a1,
            topk_weights,
@@ -433,4 +445,4 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
            apply_router_weight_on_input,
            weight_and_reduce_impl,
            do_async=False,
        )
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -117,16 +117,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
                    dtype=torch.float32,
                )
-    @staticmethod
+    @property
-    def expects_unquantized_inputs(
+    def expects_unquantized_inputs(self) -> bool:
-        moe_config: mk.FusedMoEConfig, quant_config: FusedMoEQuantConfig
+        return self.quant_config.use_fp8_w8a8 and self.quant_config.is_block_quantized
-    ) -> bool:
-        # NVFP4 TP kernels and FP8 block-quantized kernels apply
-        # input quantization inside FusedMoEPermuteExpertsUnpermute.
-        return (
-            quant_config.use_nvfp4_w4a4
-            and not moe_config.moe_parallel_config.use_all2all_kernels
-        ) or (quant_config.use_fp8_w8a8 and quant_config.is_block_quantized)
    @staticmethod
    def _supports_current_device() -> bool:
@@ -241,9 +234,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
        """
        workspace1 = (M, K)
        workspace2 = (0,)
+        # For NVFP4, the output is stored in a packed int8 format,
-        # For TP, the quantization is fused with fused_moe call.
+        # so the actual hidden dim is 2x the size of K here.
-        output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" and self.use_dp else K)
+        output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" else K)
        # The workspace is determined by `aq`, since it comes after any
        # potential communication op and is involved in the expert computation.
        return (workspace1, workspace2, output_shape)
@@ -403,4 +396,4 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        # No support for LoRA in flashinfer_cutlass_fused_moe.
        # See TODOs in flashinfer functions runMoe and runMoeMinLatency.
        raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm.distributed import get_dp_group, get_ep_group
-from vllm.distributed.device_communicators.base_device_communicator import (
-    All2AllManagerBase,
-)
-from vllm.forward_context import get_forward_context
-from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.prepare_finalize import (
-    MoEPrepareAndFinalizeNoEP,
-)
-from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP,
-)
-from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
-from vllm.utils.flashinfer import nvfp4_block_scale_interleave
-def get_local_sizes():
-    return get_forward_context().dp_metadata.get_chunk_sizes_across_dp_rank()
-class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
-    """Base class for FlashInfer MoE prepare and finalize operations."""
-    def __init__(
-        self,
-        use_dp: bool,
-        num_dispatchers: int = 1,
-        use_deepseek_fp8_block_scale: bool = False,
-    ):
-        super().__init__()
-        self.num_dispatchers_ = num_dispatchers
-        self.use_dp = use_dp
-        self.local_tokens = None
-        # Toggle for DeepSeek-style FP8 block-scale path where activations are
-        # not quantized here and weight block scales are consumed by the kernel.
-        self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
-    @property
-    def activation_format(self) -> mk.FusedMoEActivationFormat:
-        return mk.FusedMoEActivationFormat.Standard
-    def max_num_tokens_per_rank(self) -> int | None:
-        return None
-    def topk_indices_dtype(self) -> torch.dtype | None:
-        return None
-    def num_dispatchers(self) -> int:
-        return self.num_dispatchers_
-    def output_is_reduced(self) -> bool:
-        return False
-    def _apply_router_weight_on_input(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-    ) -> None:
-        """Apply router weight on input if needed."""
-        if apply_router_weight_on_input:
-            topk = topk_ids.size(1)
-            assert topk == 1, (
-                "apply_router_weight_on_input is only implemented for topk=1"
-            )
-            a1.mul_(topk_weights.to(a1.dtype))
-class FlashInferAllToAllMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
-    """FlashInfer implementation using AllToAll communication."""
-    def __init__(
-        self,
-        use_dp: bool,
-        num_dispatchers: int = 1,
-        use_deepseek_fp8_block_scale: bool = False,
-    ):
-        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
-        self.alltoall_info = None
-        # Initialize all2all_manager only for DP case
-        self.all2all_manager = None
-        if self.use_dp:
-            self.all2all_manager = get_ep_group().device_communicator.all2all_manager
-    def prepare(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_experts: int,
-        expert_map: torch.Tensor | None,
-        apply_router_weight_on_input: bool,
-        quant_config: FusedMoEQuantConfig,
-    ) -> mk.PrepareResultType:
-        self._apply_router_weight_on_input(
-            a1, topk_weights, topk_ids, apply_router_weight_on_input
-        )
-        if not self.use_dp:
-            # Non-DP case: quantize activations unless using block-scale path
-            if not self.use_deepseek_fp8_block_scale:
-                a1q, a1q_scale = moe_kernel_quantize_input(
-                    a1,
-                    quant_config.a1_gscale,
-                    quant_config.quant_dtype,
-                    quant_config.per_act_token_quant,
-                    quant_config.block_shape,
-                    is_fp4_scale_swizzled=not self.use_dp,
-                )
-            else:
-                a1q = a1
-                a1q_scale = None
-        else:
-            # DP case: use FlashInfer AllToAll
-            global_num_tokens_cpu = get_local_sizes()
-            top_k = topk_ids.size(1)
-            (self.alltoall_info, topk_ids, topk_weights, a1q, a1q_scale) = (
-                flashinfer_alltoall_dispatch(
-                    self.all2all_manager,
-                    global_num_tokens_cpu,
-                    a1,
-                    quant_config.a1_gscale,
-                    topk_ids,
-                    topk_weights,
-                    top_k,
-                    num_experts,
-                    quant_config,
-                    use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
-                )
-            )
-        return a1q, a1q_scale, None, topk_ids, topk_weights
-    def finalize(
-        self,
-        output: torch.Tensor,
-        fused_expert_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-        weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
-        if self.use_dp:
-            top_k = topk_ids.size(1)
-            token_count = output.shape[0]
-            fused_expert_output = flashinfer_alltoall_combine(
-                self.all2all_manager,
-                fused_expert_output,
-                top_k=top_k,
-                token_count=token_count,
-                alltoall_info=self.alltoall_info,
-            )
-        output.copy_(fused_expert_output)
-class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
-    def __init__(
-        self,
-        use_dp: bool,
-        num_dispatchers: int = 1,
-        use_deepseek_fp8_block_scale: bool = False,
-    ):
-        super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
-    def prepare(
-        self,
-        a1: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        num_experts: int,
-        expert_map: torch.Tensor | None,
-        apply_router_weight_on_input: bool,
-        quant_config: FusedMoEQuantConfig,
-    ) -> mk.PrepareResultType:
-        self._apply_router_weight_on_input(
-            a1, topk_weights, topk_ids, apply_router_weight_on_input
-        )
-        is_nvfp4 = quant_config.quant_dtype == "nvfp4"
-        if not self.use_dp and is_nvfp4:
-            return a1, None, None, topk_ids, topk_weights
-        if not self.use_deepseek_fp8_block_scale:
-            a1q, a1q_scale = moe_kernel_quantize_input(
-                a1,
-                quant_config.a1_gscale if is_nvfp4 else quant_config.a1_scale,
-                quant_config.quant_dtype,
-                quant_config.per_act_token_quant,
-                quant_config.block_shape,
-                is_fp4_scale_swizzled=not self.use_dp,
-            )
-        else:
-            # Block-scale path: pass activations through, omit per-token scales
-            a1q = a1
-            a1q_scale = None
-        if self.use_dp:
-            # Build gather list conditionally - omit a1q_scale if None
-            # (block-scale path)
-            gather_list = [topk_weights, topk_ids, a1q]
-            if a1q_scale is not None:
-                gather_list.append(a1q_scale)
-                gathered = get_dp_group().all_gatherv(
-                    gather_list,
-                    dim=0,
-                    sizes=get_local_sizes(),
-                )
-                topk_weights, topk_ids, a1q, a1q_scale = gathered
-            else:
-                gathered = get_dp_group().all_gatherv(
-                    gather_list,
-                    dim=0,
-                    sizes=get_local_sizes(),
-                )
-                topk_weights, topk_ids, a1q = gathered
-                a1q_scale = None
-        if is_nvfp4 and a1q_scale is not None:
-            if a1q_scale.element_size() == 1:
-                a1q_scale = a1q_scale.view(torch.uint8)
-            a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
-        return a1q, a1q_scale, None, topk_ids, topk_weights
-    def finalize(
-        self,
-        output: torch.Tensor,
-        fused_expert_output: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        apply_router_weight_on_input: bool,
-        weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
-        assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP)
-        if self.use_dp:
-            fused_expert_output = get_dp_group().reduce_scatterv(
-                fused_expert_output, dim=0, sizes=get_local_sizes()
-            )
-        output.copy_(fused_expert_output)
-def flashinfer_alltoall_dispatch(
-    all2all_manager: All2AllManagerBase,
-    global_num_tokens_cpu: list[int],
-    x: torch.Tensor,
-    gs: torch.Tensor,
-    topk_ids: torch.Tensor,
-    topk_weights: torch.Tensor,
-    top_k: int,
-    num_experts: int,
-    quant_config: FusedMoEQuantConfig,
-    use_deepseek_fp8_block_scale: bool = False,
-):
-    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
-    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
-        "FlashInfer AllToAll workspace not available"
-    )
-    ep_rank = all2all_manager.rank
-    ep_size = all2all_manager.world_size
-    max_num_token = (
-        max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else x.shape[0]
-    )
-    orig_topk_weights_dtype = topk_weights.dtype
-    alltoall_info, topk_ids, topk_weights, _ = (
-        MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
-            topk_ids,
-            topk_weights,
-            None,
-            all2all_manager.prepare_workspace_tensor,
-            max_num_token,
-            ep_rank,
-            ep_size,
-            num_experts,
-            num_experts,
-            top_k,
-        )
-    )
-    topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)
-    if not use_deepseek_fp8_block_scale:
-        x, x_sf = moe_kernel_quantize_input(
-            x,
-            gs,
-            quant_config.quant_dtype,
-            quant_config.per_act_token_quant,
-            quant_config.block_shape,
-            is_fp4_scale_swizzled=False,  # delay swizzle to after comm
-        )
-        x = MnnvlMoe.mnnvl_moe_alltoallv(
-            x,
-            alltoall_info,
-            all2all_manager.workspace_tensor,
-            ep_rank,
-            ep_size,
-        )
-        x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
-            x_sf,
-            alltoall_info,
-            all2all_manager.workspace_tensor,
-            ep_rank,
-            ep_size,
-        )
-        if quant_config.quant_dtype == "nvfp4":
-            x_sf = nvfp4_block_scale_interleave(x_sf)
-    else:
-        # Block-scale path: pass activations through without quantization
-        x_sf = None
-        x = MnnvlMoe.mnnvl_moe_alltoallv(
-            x,
-            alltoall_info,
-            all2all_manager.workspace_tensor,
-            ep_rank,
-            ep_size,
-        )
-    return alltoall_info, topk_ids, topk_weights, x, x_sf
-def flashinfer_alltoall_combine(
-    all2all_manager: All2AllManagerBase,
-    output: torch.Tensor,
-    top_k: int,
-    token_count: int,
-    alltoall_info,
-):
-    from flashinfer.comm.trtllm_alltoall import MnnvlMoe
-    assert all2all_manager.ensure_alltoall_workspace_initialized(), (
-        "FlashInfer AllToAll workspace not available"
-    )
-    return MnnvlMoe.mnnvl_moe_alltoallv_combine(
-        output,
-        alltoall_info,
-        all2all_manager.workspace_tensor,
-        ep_rank=all2all_manager.rank,
-        ep_size=all2all_manager.world_size,
-        top_k=top_k,
-        token_count=token_count,
-    )
-def create_flashinfer_prepare_finalize(
-    use_dp: bool,
-    use_nvfp4: bool = False,
-    enable_alltoallv: bool = False,
-    use_deepseek_fp8_block_scale: bool = False,
-) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP:
-    """Factory function to create the appropriate FlashInfer implementation."""
-    if use_dp:
-        if enable_alltoallv:
-            assert use_nvfp4
-            return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)
-        return FlashInferAllGatherMoEPrepareAndFinalize(
-            use_dp=True,
-            use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
-        )
-    else:
-        # CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization
-        # in a single call with the MoE experts kernel.
-        defer_input_quant = use_deepseek_fp8_block_scale or use_nvfp4
-        return MoEPrepareAndFinalizeNoEP(defer_input_quant=defer_input_quant)
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -534,7 +534,13 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
+        defer_input_quant: bool = False,
    ) -> mk.PrepareResultType:
+        if defer_input_quant:
+            raise NotImplementedError(
+                f"{self.__class__.__name__} does not support defer_input_quant=True. "
+                "Please select an MoE kernel that accepts quantized inputs."
+            )
        assert a1.dim() == 2
        assert topk_ids.dim() == 2
        assert topk_ids.size(0) == a1.size(0)
@@ -1113,4 +1119,4 @@ class BatchedTritonExperts(mk.FusedMoEExpertsModular):
            config=config,
            per_act_token_quant=self.per_act_token_quant,
            block_shape=self.block_shape,
        )
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -164,7 +164,6 @@ def fused_moe_kernel_gptq_awq(
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # ----------------------------------------------------------
    # Create pointers for the first blocks of A and B.
    # We will advance this pointer as we move in the K direction

--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -5,6 +5,7 @@ from abc import abstractmethod
 import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,