Commit eefa41c1 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.18.0

parent 82155c76
...@@ -74,6 +74,7 @@ class BaseFrontendArgs: ...@@ -74,6 +74,7 @@ class BaseFrontendArgs:
like SSL, CORS, and HTTP server settings. Those arguments are added by like SSL, CORS, and HTTP server settings. Those arguments are added by
the subclasses. the subclasses.
""" """
lora_modules: list[LoRAModulePath] | None = None lora_modules: list[LoRAModulePath] | None = None
"""LoRA modules configurations in either 'name=path' format or JSON format """LoRA modules configurations in either 'name=path' format or JSON format
or JSON list format. Example (old format): `'name=path'` Example (new or JSON list format. Example (old format): `'name=path'` Example (new
......
...@@ -216,6 +216,10 @@ def get_logits_processors( ...@@ -216,6 +216,10 @@ def get_logits_processors(
class FunctionCall(OpenAIBaseModel): class FunctionCall(OpenAIBaseModel):
# Internal field to preserve native tool call ID from tool parser.
# Excluded from serialization to maintain OpenAI API compatibility
# (function object should only contain 'name' and 'arguments').
id: str | None = Field(default=None, exclude=True)
name: str name: str
arguments: str arguments: str
...@@ -263,4 +267,4 @@ class GenerationError(Exception): ...@@ -263,4 +267,4 @@ class GenerationError(Exception):
def __init__(self, message: str = "Internal server error"): def __init__(self, message: str = "Internal server error"):
super().__init__(message) super().__init__(message)
self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
\ No newline at end of file
...@@ -282,7 +282,6 @@ def use_aot_compile() -> bool: ...@@ -282,7 +282,6 @@ def use_aot_compile() -> bool:
from vllm.model_executor.layers.batch_invariant import ( from vllm.model_executor.layers.batch_invariant import (
vllm_is_batch_invariant, vllm_is_batch_invariant,
) )
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.utils.torch_utils import is_torch_equal_or_newer
default_value = ( default_value = (
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logging_utils.access_log_filter import (
UvicornAccessLogFilter,
create_uvicorn_log_config,
)
from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter
from vllm.logging_utils.lazy import lazy from vllm.logging_utils.lazy import lazy
from vllm.logging_utils.log_time import logtime from vllm.logging_utils.log_time import logtime
...@@ -8,6 +12,8 @@ from vllm.logging_utils.log_time import logtime ...@@ -8,6 +12,8 @@ from vllm.logging_utils.log_time import logtime
__all__ = [ __all__ = [
"NewLineFormatter", "NewLineFormatter",
"ColoredFormatter", "ColoredFormatter",
"UvicornAccessLogFilter",
"create_uvicorn_log_config",
"lazy", "lazy",
"logtime", "logtime",
] ]
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Access log filter for uvicorn to exclude specific endpoints from logging.
This module provides a logging filter that can be used to suppress access logs
for specific endpoints (e.g., /health, /metrics) to reduce log noise in
production environments.
"""
import logging
from urllib.parse import urlparse
class UvicornAccessLogFilter(logging.Filter):
"""
A logging filter that excludes access logs for specified endpoint paths.
This filter is designed to work with uvicorn's access logger. It checks
the log record's arguments for the request path and filters out records
matching the excluded paths.
Uvicorn access log format:
'%s - "%s %s HTTP/%s" %d'
(client_addr, method, path, http_version, status_code)
Example:
127.0.0.1:12345 - "GET /health HTTP/1.1" 200
Args:
excluded_paths: A list of URL paths to exclude from logging.
Paths are matched exactly.
Example: ["/health", "/metrics"]
"""
def __init__(self, excluded_paths: list[str] | None = None):
super().__init__()
self.excluded_paths = set(excluded_paths or [])
def filter(self, record: logging.LogRecord) -> bool:
"""
Determine if the log record should be logged.
Args:
record: The log record to evaluate.
Returns:
True if the record should be logged, False otherwise.
"""
if not self.excluded_paths:
return True
# This filter is specific to uvicorn's access logs.
if record.name != "uvicorn.access":
return True
# The path is the 3rd argument in the log record's args tuple.
# See uvicorn's access logging implementation for details.
log_args = record.args
if isinstance(log_args, tuple) and len(log_args) >= 3:
path_with_query = log_args[2]
# Get path component without query string.
if isinstance(path_with_query, str):
path = urlparse(path_with_query).path
if path in self.excluded_paths:
return False
return True
def create_uvicorn_log_config(
excluded_paths: list[str] | None = None,
log_level: str = "info",
) -> dict:
"""
Create a uvicorn logging configuration with access log filtering.
This function generates a logging configuration dictionary that can be
passed to uvicorn's `log_config` parameter. It sets up the access log
filter to exclude specified paths.
Args:
excluded_paths: List of URL paths to exclude from access logs.
log_level: The log level for uvicorn loggers.
Returns:
A dictionary containing the logging configuration.
Example:
>>> config = create_uvicorn_log_config(["/health", "/metrics"])
>>> uvicorn.run(app, log_config=config)
"""
config = {
"version": 1,
"disable_existing_loggers": False,
"filters": {
"access_log_filter": {
"()": UvicornAccessLogFilter,
"excluded_paths": excluded_paths or [],
},
},
"formatters": {
"default": {
"()": "uvicorn.logging.DefaultFormatter",
"fmt": "%(levelprefix)s %(message)s",
"use_colors": None,
},
"access": {
"()": "uvicorn.logging.AccessFormatter",
"fmt": '%(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s', # noqa: E501
},
},
"handlers": {
"default": {
"formatter": "default",
"class": "logging.StreamHandler",
"stream": "ext://sys.stderr",
},
"access": {
"formatter": "access",
"class": "logging.StreamHandler",
"stream": "ext://sys.stdout",
"filters": ["access_log_filter"],
},
},
"loggers": {
"uvicorn": {
"handlers": ["default"],
"level": log_level.upper(),
"propagate": False,
},
"uvicorn.error": {
"level": log_level.upper(),
"handlers": ["default"],
"propagate": False,
},
"uvicorn.access": {
"handlers": ["access"],
"level": log_level.upper(),
"propagate": False,
},
},
}
return config
\ No newline at end of file
...@@ -175,6 +175,7 @@ def _fused_moe_lora_kernel( ...@@ -175,6 +175,7 @@ def _fused_moe_lora_kernel(
top_k_num, top_k_num,
lora_ids, lora_ids,
adapter_enabled, adapter_enabled,
max_loras, # <<< PR2: rename, used for masks when grid axis-2 != max_loras
# The stride variables represent how much to increase the ptr by when # The stride variables represent how much to increase the ptr by when
# moving by 1 element in a particular dimension. E.g. `stride_am` is # moving by 1 element in a particular dimension. E.g. `stride_am` is
# how much to increase `a_ptr` by to get the element one row down # how much to increase `a_ptr` by to get the element one row down
...@@ -225,7 +226,6 @@ def _fused_moe_lora_kernel( ...@@ -225,7 +226,6 @@ def _fused_moe_lora_kernel(
): ):
pid = tl.program_id(axis=0) pid = tl.program_id(axis=0)
slice_id = tl.program_id(axis=1) slice_id = tl.program_id(axis=1)
grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)
# calculate pid_m,pid_n # calculate pid_m,pid_n
...@@ -296,7 +296,6 @@ def _fused_moe_lora_kernel( ...@@ -296,7 +296,6 @@ def _fused_moe_lora_kernel(
cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty)) cur_b_ptr = tl.load(b_ptr + slice_id).to(tl.pointer_type(c_ptr.dtype.element_ty))
cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size
offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
token_mask = offs_token < num_valid_tokens token_mask = offs_token < num_valid_tokens
...@@ -504,6 +503,7 @@ def _fused_moe_lora_shrink( ...@@ -504,6 +503,7 @@ def _fused_moe_lora_shrink(
top_k_num, top_k_num,
lora_ids, lora_ids,
adapter_enabled, adapter_enabled,
lora_a_stacked[0].shape[0],
qcurr_hidden_states.stride(0), qcurr_hidden_states.stride(0),
qcurr_hidden_states.stride(1), qcurr_hidden_states.stride(1),
w1_lora_a_stacked.stride(0), w1_lora_a_stacked.stride(0),
...@@ -639,6 +639,7 @@ def _fused_moe_lora_expand( ...@@ -639,6 +639,7 @@ def _fused_moe_lora_expand(
top_k_num, top_k_num,
lora_ids, lora_ids,
adapter_enabled, adapter_enabled,
lora_b_stacked[0].shape[0],
a_intermediate_cache1.stride(0), a_intermediate_cache1.stride(0),
a_intermediate_cache1.stride(1), a_intermediate_cache1.stride(1),
w1_lora_b_stacked.stride(0), w1_lora_b_stacked.stride(0),
......
...@@ -82,7 +82,6 @@ from vllm.model_executor.kernels.linear.scaled_mm.rocm import ( ...@@ -82,7 +82,6 @@ from vllm.model_executor.kernels.linear.scaled_mm.rocm import (
from vllm.model_executor.kernels.linear.scaled_mm.triton import ( from vllm.model_executor.kernels.linear.scaled_mm.triton import (
TritonInt8ScaledMMLinearKernel, TritonInt8ScaledMMLinearKernel,
) )
from vllm.model_executor.kernels.linear.scaled_mm.xpu import ( from vllm.model_executor.kernels.linear.scaled_mm.xpu import (
XPUFP8ScaledMMLinearKernel, XPUFP8ScaledMMLinearKernel,
) )
......
...@@ -29,7 +29,6 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl( ...@@ -29,7 +29,6 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl(
and B.shape[0] % 16 == 0 # M TODO: needed? and B.shape[0] % 16 == 0 # M TODO: needed?
and B.shape[1] % 16 == 0 # K and B.shape[1] % 16 == 0 # K
and ((bias is None) or (bias.dtype == out_dtype)) and ((bias is None) or (bias.dtype == out_dtype))
and A.is_contiguous()
): ):
output = ops.wvSplitKQ( output = ops.wvSplitKQ(
B.t(), B.t(),
...@@ -116,4 +115,4 @@ class ROCmFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): ...@@ -116,4 +115,4 @@ class ROCmFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
output = torch.ops.vllm.rocm_per_tensor_float_w8a8_scaled_mm_impl( output = torch.ops.vllm.rocm_per_tensor_float_w8a8_scaled_mm_impl(
A, B, out_dtype, As, Bs, bias A, B, out_dtype, As, Bs, bias
) )
return torch.narrow(output, 0, 0, A.shape[0]).view(*output_shape) return torch.narrow(output, 0, 0, A.shape[0]).view(*output_shape)
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import torch
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
FP8ScaledMMLinearKernel,
FP8ScaledMMLinearLayerConfig,
)
from vllm.platforms import current_platform
class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
@classmethod
def is_supported(
cls, compute_capability: int | None = None
) -> tuple[bool, str | None]:
if not current_platform.is_xpu():
return False, "XPUFP8ScaledMM only support on XPU"
return True, None
@classmethod
def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
if c.weight_quant_key.dtype not in {torch.float8_e5m2, torch.float8_e4m3fn}:
return False, "XPUFP8ScaledMM only support FP8 weight dtype"
return True, None
def __init__(
self, c: FP8ScaledMMLinearLayerConfig, layer_param_names: Sequence[str]
) -> None:
assert self.can_implement(c)[0]
assert self.is_supported()[0]
self.config = c
self.layer_param_names = layer_param_names
def apply_weights(
self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: torch.Tensor | None = None,
) -> torch.Tensor:
weight = layer.weight
weight_scale = layer.weight_scale
return torch.ops._xpu_C.fp8_gemm_w8a16(x, weight, weight_scale, bias)
def apply_scaled_mm(
self,
*,
A: torch.Tensor,
B: torch.Tensor,
out_dtype: torch.dtype,
As: torch.Tensor,
Bs: torch.Tensor,
bias: torch.Tensor | None,
output_shape: list,
) -> torch.Tensor:
pass
...@@ -2291,12 +2291,14 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ...@@ -2291,12 +2291,14 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v
): ):
assert isinstance(prefill, FlashInferPrefillMetadata) assert isinstance(prefill, FlashInferPrefillMetadata)
attn_out, lse = prefill.prefill_chunks[chunk_idx].run( attn_out, lse = prefill.prefill_chunks[chunk_idx].run(
q=q, q=q,
k=k, k=k,
v=v, v=v,
return_lse=True, return_lse=True,
) )
# Convert from (q_len, num_heads) to (num_heads, q_len) # Convert from (q_len, num_heads) to (num_heads, q_len)
return attn_out, lse.transpose(0, 1).contiguous() return attn_out, lse.transpose(0, 1).contiguous()
...@@ -2730,4 +2732,4 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ...@@ -2730,4 +2732,4 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
attn_metadata: M, attn_metadata: M,
layer: AttentionLayer, layer: AttentionLayer,
) -> tuple[torch.Tensor, torch.Tensor | None]: ) -> tuple[torch.Tensor, torch.Tensor | None]:
raise NotImplementedError raise NotImplementedError
\ No newline at end of file
...@@ -9,6 +9,7 @@ from vllm.config import get_current_vllm_config ...@@ -9,6 +9,7 @@ from vllm.config import get_current_vllm_config
from vllm.distributed import ( from vllm.distributed import (
get_ep_group, get_ep_group,
) )
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
FusedMoEParallelConfig, FusedMoEParallelConfig,
...@@ -30,6 +31,8 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( ...@@ -30,6 +31,8 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep from vllm.utils.import_utils import has_deep_ep, has_mori, has_nixl_ep
logger = init_logger(__name__)
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
if has_deep_ep(): if has_deep_ep():
from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
...@@ -90,6 +93,20 @@ def maybe_make_prepare_finalize( ...@@ -90,6 +93,20 @@ def maybe_make_prepare_finalize(
allow_new_interface: bool = False, allow_new_interface: bool = False,
use_monolithic: bool = False, use_monolithic: bool = False,
) -> FusedMoEPrepareAndFinalize | None: ) -> FusedMoEPrepareAndFinalize | None:
# NOTE(rob): we are migrating each quant_method to hold the MK
# in all cases. The allow_new_interface=False flag allow us to fall
# back to the old method for methods that have not yet been migrated.
#
# In old method:
# * maybe_init_modular_kernel() calls this function. If we are
# using no Dp/Ep or naive all2all, we return None this function
# returns None and no ModularKernelMethod is created. If non-naive
# all2all is used, this returns a PrepareAndFinalize object and
# a ModularKernelMethod is created.
# In new method:
# * maybe_make_prepare_finalize() is called from the oracle. We
# always return a PrepareAndFinalize object and the quant method
# holds the ModularKernel.
if not moe.moe_parallel_config.use_all2all_kernels: if not moe.moe_parallel_config.use_all2all_kernels:
if not allow_new_interface: if not allow_new_interface:
return None return None
...@@ -114,11 +131,6 @@ def maybe_make_prepare_finalize( ...@@ -114,11 +131,6 @@ def maybe_make_prepare_finalize(
assert all2all_manager is not None assert all2all_manager is not None
prepare_finalize: FusedMoEPrepareAndFinalize | None = None prepare_finalize: FusedMoEPrepareAndFinalize | None = None
# TODO(rob): update this as part of the MoE refactor.
assert not moe.use_flashinfer_cutlass_kernels, (
"Must be created in modelopt.py or fp8.py"
)
if moe.use_deepep_ht_kernels: if moe.use_deepep_ht_kernels:
assert moe.dp_size == all2all_manager.dp_world_size assert moe.dp_size == all2all_manager.dp_world_size
...@@ -258,4 +270,5 @@ def maybe_make_prepare_finalize( ...@@ -258,4 +270,5 @@ def maybe_make_prepare_finalize(
physical_to_global=physical_to_global, physical_to_global=physical_to_global,
local_expert_global_ids=local_expert_global_ids, local_expert_global_ids=local_expert_global_ids,
) )
return prepare_finalize
return prepare_finalize
\ No newline at end of file
...@@ -17,7 +17,6 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( ...@@ -17,7 +17,6 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
) )
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_triton_kernels from vllm.utils.import_utils import has_triton_kernels
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
...@@ -1262,4 +1261,4 @@ class FusedMoEConfig: ...@@ -1262,4 +1261,4 @@ class FusedMoEConfig:
@property @property
def use_nixl_ep_kernels(self): def use_nixl_ep_kernels(self):
return self.moe_parallel_config.use_nixl_ep_kernels return self.moe_parallel_config.use_nixl_ep_kernels
\ No newline at end of file
...@@ -107,7 +107,13 @@ def run_cutlass_moe_fp8( ...@@ -107,7 +107,13 @@ def run_cutlass_moe_fp8(
), "Intermediate scale shape mismatch" ), "Intermediate scale shape mismatch"
assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype" assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
if expert_map is not None: # NOTE(rob): the expert_map is used for the STANDARD case and
# the batched format is used by the BATCHED case.
# TODO(rob): update the MK interface to only pass the expert_map
# during the STANDARD case to make this clearer across all kernels.
if use_batched_format:
assert expert_num_tokens is not None
else:
assert expert_num_tokens is None assert expert_num_tokens is None
# We have two modes: batched experts and non-batched experts. # We have two modes: batched experts and non-batched experts.
...@@ -1188,4 +1194,4 @@ def cutlass_moe_w4a8_fp8( ...@@ -1188,4 +1194,4 @@ def cutlass_moe_w4a8_fp8(
global_num_experts=num_experts, global_num_experts=num_experts,
expert_map=expert_map, expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input, apply_router_weight_on_input=apply_router_weight_on_input,
) )
\ No newline at end of file
...@@ -103,6 +103,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -103,6 +103,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
num_experts: int, num_experts: int,
a1_scale: torch.Tensor | None, a1_scale: torch.Tensor | None,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
defer_input_quant: bool,
) -> Callable: ) -> Callable:
has_scales = token_scales is not None has_scales = token_scales is not None
...@@ -174,6 +175,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -174,6 +175,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_topk_weights, expert_topk_weights,
a1_scale, a1_scale,
quant_config, quant_config,
defer_input_quant=defer_input_quant,
) )
def _receiver( def _receiver(
...@@ -187,6 +189,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -187,6 +189,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_topk_weights: torch.Tensor | None, expert_topk_weights: torch.Tensor | None,
a1_scale: torch.Tensor | None, a1_scale: torch.Tensor | None,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
defer_input_quant: bool,
) -> mk.PrepareResultType: ) -> mk.PrepareResultType:
if event.event is not None: if event.event is not None:
event.current_stream_wait() event.current_stream_wait()
...@@ -221,14 +224,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -221,14 +224,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_num_tokens_per_expert_list, device=expert_x.device expert_num_tokens_per_expert_list, device=expert_x.device
) )
# Dispatch and Quant # * For non-block quant, dispatch in b16 and quantize now as
# DeepEP kernels only support dispatching block-quantized # DeepEP kernels only support dispatching block scales.
# activation scales. # * For expert kernels that require unquantized inputs,
# Dispatch in bfloat16 and quantize afterwards # defer quantization to FusedMoEExpertsPermuteUnpermute.
if not quant_config.is_block_quantized: if not quant_config.is_block_quantized and not defer_input_quant:
# Quantize after dispatch. # Quantize after dispatch.
expert_x_scale = None expert_x_scale = None
if expert_x.numel() != 0: if expert_x.numel() != 0:
# TODO: support per_act_token_quant,
expert_x, expert_x_scale = moe_kernel_quantize_input( expert_x, expert_x_scale = moe_kernel_quantize_input(
expert_x, expert_x,
a1_scale, a1_scale,
...@@ -258,6 +262,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -258,6 +262,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map: torch.Tensor | None, expert_map: torch.Tensor | None,
apply_router_weight_on_input: bool, apply_router_weight_on_input: bool,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
defer_input_quant: bool = False,
) -> mk.ReceiverType: ) -> mk.ReceiverType:
if apply_router_weight_on_input: if apply_router_weight_on_input:
topk = topk_ids.size(1) topk = topk_ids.size(1)
...@@ -267,8 +272,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -267,8 +272,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
) )
a1 = a1 * topk_weights.to(a1.dtype) a1 = a1 * topk_weights.to(a1.dtype)
if quant_config.is_block_quantized: # * DeepEP only supports fp8 block scales so quantize
# Quant and Dispatch # before the dispatch for these models.
# * For all other quantization, dispatch after.
# * For expert kernels that require unquantized inputs,
# defer quantization to FusedMoEExpertsPermuteUnpermute.
if quant_config.is_block_quantized and not defer_input_quant:
a1q, a1q_scale = moe_kernel_quantize_input( a1q, a1q_scale = moe_kernel_quantize_input(
a1, a1,
quant_config.a1_scale, quant_config.a1_scale,
...@@ -282,7 +291,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -282,7 +291,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
else: else:
a1q = a1 a1q = a1
a1q_scale = None a1q_scale = None
a1_post_scale = quant_config.a1_scale a1_post_scale = (
quant_config.a1_gscale
if quant_config.quant_dtype == "nvfp4"
else quant_config.a1_scale
)
return self._do_dispatch( return self._do_dispatch(
tokens=a1q, tokens=a1q,
...@@ -292,6 +305,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -292,6 +305,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
num_experts=num_experts, num_experts=num_experts,
a1_scale=a1_post_scale, a1_scale=a1_post_scale,
quant_config=quant_config, quant_config=quant_config,
defer_input_quant=defer_input_quant,
) )
def prepare( def prepare(
...@@ -303,6 +317,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -303,6 +317,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map: torch.Tensor | None, expert_map: torch.Tensor | None,
apply_router_weight_on_input: bool, apply_router_weight_on_input: bool,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
defer_input_quant: bool = False,
) -> mk.PrepareResultType: ) -> mk.PrepareResultType:
receiver = self.prepare_async( receiver = self.prepare_async(
a1, a1,
...@@ -312,6 +327,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -312,6 +327,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map, expert_map,
apply_router_weight_on_input, apply_router_weight_on_input,
quant_config, quant_config,
defer_input_quant,
) )
return receiver() return receiver()
...@@ -418,4 +434,4 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -418,4 +434,4 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
apply_router_weight_on_input, apply_router_weight_on_input,
weight_and_reduce_impl, weight_and_reduce_impl,
False, False,
) )
\ No newline at end of file
...@@ -241,7 +241,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -241,7 +241,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map: torch.Tensor | None, expert_map: torch.Tensor | None,
apply_router_weight_on_input: bool, apply_router_weight_on_input: bool,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
defer_input_quant: bool = False,
) -> tuple[Callable, mk.ReceiverType]: ) -> tuple[Callable, mk.ReceiverType]:
if defer_input_quant:
raise NotImplementedError(
f"{self.__class__.__name__} does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
hidden_size = a1.size(1) hidden_size = a1.size(1)
assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, ( assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, (
...@@ -344,7 +350,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -344,7 +350,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map: torch.Tensor | None, expert_map: torch.Tensor | None,
apply_router_weight_on_input: bool, apply_router_weight_on_input: bool,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
defer_input_quant: bool = False,
) -> mk.PrepareResultType: ) -> mk.PrepareResultType:
if defer_input_quant:
raise NotImplementedError(
f"{self.__class__.__name__} does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
hook, receiver = self.prepare_async( hook, receiver = self.prepare_async(
a1, a1,
topk_weights, topk_weights,
...@@ -433,4 +445,4 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -433,4 +445,4 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
apply_router_weight_on_input, apply_router_weight_on_input,
weight_and_reduce_impl, weight_and_reduce_impl,
do_async=False, do_async=False,
) )
\ No newline at end of file
...@@ -117,16 +117,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular): ...@@ -117,16 +117,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
dtype=torch.float32, dtype=torch.float32,
) )
@staticmethod @property
def expects_unquantized_inputs( def expects_unquantized_inputs(self) -> bool:
moe_config: mk.FusedMoEConfig, quant_config: FusedMoEQuantConfig return self.quant_config.use_fp8_w8a8 and self.quant_config.is_block_quantized
) -> bool:
# NVFP4 TP kernels and FP8 block-quantized kernels apply
# input quantization inside FusedMoEPermuteExpertsUnpermute.
return (
quant_config.use_nvfp4_w4a4
and not moe_config.moe_parallel_config.use_all2all_kernels
) or (quant_config.use_fp8_w8a8 and quant_config.is_block_quantized)
@staticmethod @staticmethod
def _supports_current_device() -> bool: def _supports_current_device() -> bool:
...@@ -241,9 +234,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular): ...@@ -241,9 +234,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
""" """
workspace1 = (M, K) workspace1 = (M, K)
workspace2 = (0,) workspace2 = (0,)
# For NVFP4, the output is stored in a packed int8 format,
# For TP, the quantization is fused with fused_moe call. # so the actual hidden dim is 2x the size of K here.
output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" and self.use_dp else K) output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" else K)
# The workspace is determined by `aq`, since it comes after any # The workspace is determined by `aq`, since it comes after any
# potential communication op and is involved in the expert computation. # potential communication op and is involved in the expert computation.
return (workspace1, workspace2, output_shape) return (workspace1, workspace2, output_shape)
...@@ -403,4 +396,4 @@ class FlashInferExperts(mk.FusedMoEExpertsModular): ...@@ -403,4 +396,4 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None: def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
# No support for LoRA in flashinfer_cutlass_fused_moe. # No support for LoRA in flashinfer_cutlass_fused_moe.
# See TODOs in flashinfer functions runMoe and runMoeMinLatency. # See TODOs in flashinfer functions runMoe and runMoeMinLatency.
raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe") raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.distributed import get_dp_group, get_ep_group
from vllm.distributed.device_communicators.base_device_communicator import (
All2AllManagerBase,
)
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceNoOP,
)
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.utils.flashinfer import nvfp4_block_scale_interleave
def get_local_sizes():
return get_forward_context().dp_metadata.get_chunk_sizes_across_dp_rank()
class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
"""Base class for FlashInfer MoE prepare and finalize operations."""
def __init__(
self,
use_dp: bool,
num_dispatchers: int = 1,
use_deepseek_fp8_block_scale: bool = False,
):
super().__init__()
self.num_dispatchers_ = num_dispatchers
self.use_dp = use_dp
self.local_tokens = None
# Toggle for DeepSeek-style FP8 block-scale path where activations are
# not quantized here and weight block scales are consumed by the kernel.
self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
@property
def activation_format(self) -> mk.FusedMoEActivationFormat:
return mk.FusedMoEActivationFormat.Standard
def max_num_tokens_per_rank(self) -> int | None:
return None
def topk_indices_dtype(self) -> torch.dtype | None:
return None
def num_dispatchers(self) -> int:
return self.num_dispatchers_
def output_is_reduced(self) -> bool:
return False
def _apply_router_weight_on_input(
self,
a1: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
apply_router_weight_on_input: bool,
) -> None:
"""Apply router weight on input if needed."""
if apply_router_weight_on_input:
topk = topk_ids.size(1)
assert topk == 1, (
"apply_router_weight_on_input is only implemented for topk=1"
)
a1.mul_(topk_weights.to(a1.dtype))
class FlashInferAllToAllMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
"""FlashInfer implementation using AllToAll communication."""
def __init__(
self,
use_dp: bool,
num_dispatchers: int = 1,
use_deepseek_fp8_block_scale: bool = False,
):
super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
self.alltoall_info = None
# Initialize all2all_manager only for DP case
self.all2all_manager = None
if self.use_dp:
self.all2all_manager = get_ep_group().device_communicator.all2all_manager
def prepare(
self,
a1: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
num_experts: int,
expert_map: torch.Tensor | None,
apply_router_weight_on_input: bool,
quant_config: FusedMoEQuantConfig,
) -> mk.PrepareResultType:
self._apply_router_weight_on_input(
a1, topk_weights, topk_ids, apply_router_weight_on_input
)
if not self.use_dp:
# Non-DP case: quantize activations unless using block-scale path
if not self.use_deepseek_fp8_block_scale:
a1q, a1q_scale = moe_kernel_quantize_input(
a1,
quant_config.a1_gscale,
quant_config.quant_dtype,
quant_config.per_act_token_quant,
quant_config.block_shape,
is_fp4_scale_swizzled=not self.use_dp,
)
else:
a1q = a1
a1q_scale = None
else:
# DP case: use FlashInfer AllToAll
global_num_tokens_cpu = get_local_sizes()
top_k = topk_ids.size(1)
(self.alltoall_info, topk_ids, topk_weights, a1q, a1q_scale) = (
flashinfer_alltoall_dispatch(
self.all2all_manager,
global_num_tokens_cpu,
a1,
quant_config.a1_gscale,
topk_ids,
topk_weights,
top_k,
num_experts,
quant_config,
use_deepseek_fp8_block_scale=self.use_deepseek_fp8_block_scale,
)
)
return a1q, a1q_scale, None, topk_ids, topk_weights
def finalize(
self,
output: torch.Tensor,
fused_expert_output: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
apply_router_weight_on_input: bool,
weight_and_reduce_impl: mk.TopKWeightAndReduce,
) -> None:
if self.use_dp:
top_k = topk_ids.size(1)
token_count = output.shape[0]
fused_expert_output = flashinfer_alltoall_combine(
self.all2all_manager,
fused_expert_output,
top_k=top_k,
token_count=token_count,
alltoall_info=self.alltoall_info,
)
output.copy_(fused_expert_output)
class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFinalize):
def __init__(
self,
use_dp: bool,
num_dispatchers: int = 1,
use_deepseek_fp8_block_scale: bool = False,
):
super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
def prepare(
self,
a1: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
num_experts: int,
expert_map: torch.Tensor | None,
apply_router_weight_on_input: bool,
quant_config: FusedMoEQuantConfig,
) -> mk.PrepareResultType:
self._apply_router_weight_on_input(
a1, topk_weights, topk_ids, apply_router_weight_on_input
)
is_nvfp4 = quant_config.quant_dtype == "nvfp4"
if not self.use_dp and is_nvfp4:
return a1, None, None, topk_ids, topk_weights
if not self.use_deepseek_fp8_block_scale:
a1q, a1q_scale = moe_kernel_quantize_input(
a1,
quant_config.a1_gscale if is_nvfp4 else quant_config.a1_scale,
quant_config.quant_dtype,
quant_config.per_act_token_quant,
quant_config.block_shape,
is_fp4_scale_swizzled=not self.use_dp,
)
else:
# Block-scale path: pass activations through, omit per-token scales
a1q = a1
a1q_scale = None
if self.use_dp:
# Build gather list conditionally - omit a1q_scale if None
# (block-scale path)
gather_list = [topk_weights, topk_ids, a1q]
if a1q_scale is not None:
gather_list.append(a1q_scale)
gathered = get_dp_group().all_gatherv(
gather_list,
dim=0,
sizes=get_local_sizes(),
)
topk_weights, topk_ids, a1q, a1q_scale = gathered
else:
gathered = get_dp_group().all_gatherv(
gather_list,
dim=0,
sizes=get_local_sizes(),
)
topk_weights, topk_ids, a1q = gathered
a1q_scale = None
if is_nvfp4 and a1q_scale is not None:
if a1q_scale.element_size() == 1:
a1q_scale = a1q_scale.view(torch.uint8)
a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
return a1q, a1q_scale, None, topk_ids, topk_weights
def finalize(
self,
output: torch.Tensor,
fused_expert_output: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
apply_router_weight_on_input: bool,
weight_and_reduce_impl: mk.TopKWeightAndReduce,
) -> None:
assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP)
if self.use_dp:
fused_expert_output = get_dp_group().reduce_scatterv(
fused_expert_output, dim=0, sizes=get_local_sizes()
)
output.copy_(fused_expert_output)
def flashinfer_alltoall_dispatch(
all2all_manager: All2AllManagerBase,
global_num_tokens_cpu: list[int],
x: torch.Tensor,
gs: torch.Tensor,
topk_ids: torch.Tensor,
topk_weights: torch.Tensor,
top_k: int,
num_experts: int,
quant_config: FusedMoEQuantConfig,
use_deepseek_fp8_block_scale: bool = False,
):
from flashinfer.comm.trtllm_alltoall import MnnvlMoe
assert all2all_manager.ensure_alltoall_workspace_initialized(), (
"FlashInfer AllToAll workspace not available"
)
ep_rank = all2all_manager.rank
ep_size = all2all_manager.world_size
max_num_token = (
max(global_num_tokens_cpu) if global_num_tokens_cpu is not None else x.shape[0]
)
orig_topk_weights_dtype = topk_weights.dtype
alltoall_info, topk_ids, topk_weights, _ = (
MnnvlMoe.mnnvl_moe_alltoallv_prepare_without_allgather(
topk_ids,
topk_weights,
None,
all2all_manager.prepare_workspace_tensor,
max_num_token,
ep_rank,
ep_size,
num_experts,
num_experts,
top_k,
)
)
topk_weights = topk_weights.view(dtype=orig_topk_weights_dtype)
if not use_deepseek_fp8_block_scale:
x, x_sf = moe_kernel_quantize_input(
x,
gs,
quant_config.quant_dtype,
quant_config.per_act_token_quant,
quant_config.block_shape,
is_fp4_scale_swizzled=False, # delay swizzle to after comm
)
x = MnnvlMoe.mnnvl_moe_alltoallv(
x,
alltoall_info,
all2all_manager.workspace_tensor,
ep_rank,
ep_size,
)
x_sf = MnnvlMoe.mnnvl_moe_alltoallv(
x_sf,
alltoall_info,
all2all_manager.workspace_tensor,
ep_rank,
ep_size,
)
if quant_config.quant_dtype == "nvfp4":
x_sf = nvfp4_block_scale_interleave(x_sf)
else:
# Block-scale path: pass activations through without quantization
x_sf = None
x = MnnvlMoe.mnnvl_moe_alltoallv(
x,
alltoall_info,
all2all_manager.workspace_tensor,
ep_rank,
ep_size,
)
return alltoall_info, topk_ids, topk_weights, x, x_sf
def flashinfer_alltoall_combine(
all2all_manager: All2AllManagerBase,
output: torch.Tensor,
top_k: int,
token_count: int,
alltoall_info,
):
from flashinfer.comm.trtllm_alltoall import MnnvlMoe
assert all2all_manager.ensure_alltoall_workspace_initialized(), (
"FlashInfer AllToAll workspace not available"
)
return MnnvlMoe.mnnvl_moe_alltoallv_combine(
output,
alltoall_info,
all2all_manager.workspace_tensor,
ep_rank=all2all_manager.rank,
ep_size=all2all_manager.world_size,
top_k=top_k,
token_count=token_count,
)
def create_flashinfer_prepare_finalize(
use_dp: bool,
use_nvfp4: bool = False,
enable_alltoallv: bool = False,
use_deepseek_fp8_block_scale: bool = False,
) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP:
"""Factory function to create the appropriate FlashInfer implementation."""
if use_dp:
if enable_alltoallv:
assert use_nvfp4
return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)
return FlashInferAllGatherMoEPrepareAndFinalize(
use_dp=True,
use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
)
else:
# CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization
# in a single call with the MoE experts kernel.
defer_input_quant = use_deepseek_fp8_block_scale or use_nvfp4
return MoEPrepareAndFinalizeNoEP(defer_input_quant=defer_input_quant)
\ No newline at end of file
...@@ -534,7 +534,13 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): ...@@ -534,7 +534,13 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map: torch.Tensor | None, expert_map: torch.Tensor | None,
apply_router_weight_on_input: bool, apply_router_weight_on_input: bool,
quant_config: FusedMoEQuantConfig, quant_config: FusedMoEQuantConfig,
defer_input_quant: bool = False,
) -> mk.PrepareResultType: ) -> mk.PrepareResultType:
if defer_input_quant:
raise NotImplementedError(
f"{self.__class__.__name__} does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
assert a1.dim() == 2 assert a1.dim() == 2
assert topk_ids.dim() == 2 assert topk_ids.dim() == 2
assert topk_ids.size(0) == a1.size(0) assert topk_ids.size(0) == a1.size(0)
...@@ -1113,4 +1119,4 @@ class BatchedTritonExperts(mk.FusedMoEExpertsModular): ...@@ -1113,4 +1119,4 @@ class BatchedTritonExperts(mk.FusedMoEExpertsModular):
config=config, config=config,
per_act_token_quant=self.per_act_token_quant, per_act_token_quant=self.per_act_token_quant,
block_shape=self.block_shape, block_shape=self.block_shape,
) )
\ No newline at end of file
...@@ -164,7 +164,6 @@ def fused_moe_kernel_gptq_awq( ...@@ -164,7 +164,6 @@ def fused_moe_kernel_gptq_awq(
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m pid_n = (pid % num_pid_in_group) // group_size_m
# ---------------------------------------------------------- # ----------------------------------------------------------
# Create pointers for the first blocks of A and B. # Create pointers for the first blocks of A and B.
# We will advance this pointer as we move in the K direction # We will advance this pointer as we move in the K direction
......
...@@ -5,6 +5,7 @@ from abc import abstractmethod ...@@ -5,6 +5,7 @@ from abc import abstractmethod
import torch import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment