Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
eefa41c1
Commit
eefa41c1
authored
Mar 24, 2026
by
zhuwenwen
Browse files
sync v0.18.0
parent
82155c76
Changes
253
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
245 additions
and
477 deletions
+245
-477
vllm/entrypoints/openai/cli_args.py
vllm/entrypoints/openai/cli_args.py
+1
-0
vllm/entrypoints/openai/engine/protocol.py
vllm/entrypoints/openai/engine/protocol.py
+5
-1
vllm/envs.py
vllm/envs.py
+0
-1
vllm/logging_utils/__init__.py
vllm/logging_utils/__init__.py
+7
-1
vllm/logging_utils/access_log_filter.py
vllm/logging_utils/access_log_filter.py
+144
-0
vllm/lora/ops/triton_ops/fused_moe_lora_op.py
vllm/lora/ops/triton_ops/fused_moe_lora_op.py
+3
-2
vllm/model_executor/kernels/linear/__init__.py
vllm/model_executor/kernels/linear/__init__.py
+0
-1
vllm/model_executor/kernels/linear/scaled_mm/rocm.py
vllm/model_executor/kernels/linear/scaled_mm/rocm.py
+1
-2
vllm/model_executor/kernels/linear/xpu.py
vllm/model_executor/kernels/linear/xpu.py
+0
-59
vllm/model_executor/layers/attention/mla_attention.py
vllm/model_executor/layers/attention/mla_attention.py
+3
-1
vllm/model_executor/layers/fused_moe/all2all_utils.py
vllm/model_executor/layers/fused_moe/all2all_utils.py
+19
-6
vllm/model_executor/layers/fused_moe/config.py
vllm/model_executor/layers/fused_moe/config.py
+1
-2
vllm/model_executor/layers/fused_moe/cutlass_moe.py
vllm/model_executor/layers/fused_moe/cutlass_moe.py
+8
-2
vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
...l_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+25
-9
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
...l_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+13
-1
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
...model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+7
-14
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
...r/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+0
-373
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+7
-1
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+0
-1
vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
.../model_executor/layers/fused_moe/fused_moe_method_base.py
+1
-0
No files found.
vllm/entrypoints/openai/cli_args.py
View file @
eefa41c1
...
@@ -74,6 +74,7 @@ class BaseFrontendArgs:
...
@@ -74,6 +74,7 @@ class BaseFrontendArgs:
like SSL, CORS, and HTTP server settings. Those arguments are added by
like SSL, CORS, and HTTP server settings. Those arguments are added by
the subclasses.
the subclasses.
"""
"""
lora_modules
:
list
[
LoRAModulePath
]
|
None
=
None
lora_modules
:
list
[
LoRAModulePath
]
|
None
=
None
"""LoRA modules configurations in either 'name=path' format or JSON format
"""LoRA modules configurations in either 'name=path' format or JSON format
or JSON list format. Example (old format): `'name=path'` Example (new
or JSON list format. Example (old format): `'name=path'` Example (new
...
...
vllm/entrypoints/openai/engine/protocol.py
View file @
eefa41c1
...
@@ -216,6 +216,10 @@ def get_logits_processors(
...
@@ -216,6 +216,10 @@ def get_logits_processors(
class
FunctionCall
(
OpenAIBaseModel
):
class
FunctionCall
(
OpenAIBaseModel
):
# Internal field to preserve native tool call ID from tool parser.
# Excluded from serialization to maintain OpenAI API compatibility
# (function object should only contain 'name' and 'arguments').
id
:
str
|
None
=
Field
(
default
=
None
,
exclude
=
True
)
name
:
str
name
:
str
arguments
:
str
arguments
:
str
...
...
vllm/envs.py
View file @
eefa41c1
...
@@ -282,7 +282,6 @@ def use_aot_compile() -> bool:
...
@@ -282,7 +282,6 @@ def use_aot_compile() -> bool:
from
vllm.model_executor.layers.batch_invariant
import
(
from
vllm.model_executor.layers.batch_invariant
import
(
vllm_is_batch_invariant
,
vllm_is_batch_invariant
,
)
)
from
vllm.platforms
import
current_platform
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
from
vllm.utils.torch_utils
import
is_torch_equal_or_newer
default_value
=
(
default_value
=
(
...
...
vllm/logging_utils/__init__.py
View file @
eefa41c1
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.logging_utils.access_log_filter
import
(
UvicornAccessLogFilter
,
create_uvicorn_log_config
,
)
from
vllm.logging_utils.formatter
import
ColoredFormatter
,
NewLineFormatter
from
vllm.logging_utils.formatter
import
ColoredFormatter
,
NewLineFormatter
from
vllm.logging_utils.lazy
import
lazy
from
vllm.logging_utils.lazy
import
lazy
from
vllm.logging_utils.log_time
import
logtime
from
vllm.logging_utils.log_time
import
logtime
...
@@ -8,6 +12,8 @@ from vllm.logging_utils.log_time import logtime
...
@@ -8,6 +12,8 @@ from vllm.logging_utils.log_time import logtime
__all__
=
[
__all__
=
[
"NewLineFormatter"
,
"NewLineFormatter"
,
"ColoredFormatter"
,
"ColoredFormatter"
,
"UvicornAccessLogFilter"
,
"create_uvicorn_log_config"
,
"lazy"
,
"lazy"
,
"logtime"
,
"logtime"
,
]
]
\ No newline at end of file
vllm/logging_utils/access_log_filter.py
0 → 100644
View file @
eefa41c1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Access log filter for uvicorn to exclude specific endpoints from logging.
This module provides a logging filter that can be used to suppress access logs
for specific endpoints (e.g., /health, /metrics) to reduce log noise in
production environments.
"""
import
logging
from
urllib.parse
import
urlparse
class
UvicornAccessLogFilter
(
logging
.
Filter
):
"""
A logging filter that excludes access logs for specified endpoint paths.
This filter is designed to work with uvicorn's access logger. It checks
the log record's arguments for the request path and filters out records
matching the excluded paths.
Uvicorn access log format:
'%s - "%s %s HTTP/%s" %d'
(client_addr, method, path, http_version, status_code)
Example:
127.0.0.1:12345 - "GET /health HTTP/1.1" 200
Args:
excluded_paths: A list of URL paths to exclude from logging.
Paths are matched exactly.
Example: ["/health", "/metrics"]
"""
def
__init__
(
self
,
excluded_paths
:
list
[
str
]
|
None
=
None
):
super
().
__init__
()
self
.
excluded_paths
=
set
(
excluded_paths
or
[])
def
filter
(
self
,
record
:
logging
.
LogRecord
)
->
bool
:
"""
Determine if the log record should be logged.
Args:
record: The log record to evaluate.
Returns:
True if the record should be logged, False otherwise.
"""
if
not
self
.
excluded_paths
:
return
True
# This filter is specific to uvicorn's access logs.
if
record
.
name
!=
"uvicorn.access"
:
return
True
# The path is the 3rd argument in the log record's args tuple.
# See uvicorn's access logging implementation for details.
log_args
=
record
.
args
if
isinstance
(
log_args
,
tuple
)
and
len
(
log_args
)
>=
3
:
path_with_query
=
log_args
[
2
]
# Get path component without query string.
if
isinstance
(
path_with_query
,
str
):
path
=
urlparse
(
path_with_query
).
path
if
path
in
self
.
excluded_paths
:
return
False
return
True
def
create_uvicorn_log_config
(
excluded_paths
:
list
[
str
]
|
None
=
None
,
log_level
:
str
=
"info"
,
)
->
dict
:
"""
Create a uvicorn logging configuration with access log filtering.
This function generates a logging configuration dictionary that can be
passed to uvicorn's `log_config` parameter. It sets up the access log
filter to exclude specified paths.
Args:
excluded_paths: List of URL paths to exclude from access logs.
log_level: The log level for uvicorn loggers.
Returns:
A dictionary containing the logging configuration.
Example:
>>> config = create_uvicorn_log_config(["/health", "/metrics"])
>>> uvicorn.run(app, log_config=config)
"""
config
=
{
"version"
:
1
,
"disable_existing_loggers"
:
False
,
"filters"
:
{
"access_log_filter"
:
{
"()"
:
UvicornAccessLogFilter
,
"excluded_paths"
:
excluded_paths
or
[],
},
},
"formatters"
:
{
"default"
:
{
"()"
:
"uvicorn.logging.DefaultFormatter"
,
"fmt"
:
"%(levelprefix)s %(message)s"
,
"use_colors"
:
None
,
},
"access"
:
{
"()"
:
"uvicorn.logging.AccessFormatter"
,
"fmt"
:
'%(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s'
,
# noqa: E501
},
},
"handlers"
:
{
"default"
:
{
"formatter"
:
"default"
,
"class"
:
"logging.StreamHandler"
,
"stream"
:
"ext://sys.stderr"
,
},
"access"
:
{
"formatter"
:
"access"
,
"class"
:
"logging.StreamHandler"
,
"stream"
:
"ext://sys.stdout"
,
"filters"
:
[
"access_log_filter"
],
},
},
"loggers"
:
{
"uvicorn"
:
{
"handlers"
:
[
"default"
],
"level"
:
log_level
.
upper
(),
"propagate"
:
False
,
},
"uvicorn.error"
:
{
"level"
:
log_level
.
upper
(),
"handlers"
:
[
"default"
],
"propagate"
:
False
,
},
"uvicorn.access"
:
{
"handlers"
:
[
"access"
],
"level"
:
log_level
.
upper
(),
"propagate"
:
False
,
},
},
}
return
config
\ No newline at end of file
vllm/lora/ops/triton_ops/fused_moe_lora_op.py
View file @
eefa41c1
...
@@ -175,6 +175,7 @@ def _fused_moe_lora_kernel(
...
@@ -175,6 +175,7 @@ def _fused_moe_lora_kernel(
top_k_num
,
top_k_num
,
lora_ids
,
lora_ids
,
adapter_enabled
,
adapter_enabled
,
max_loras
,
# <<< PR2: rename, used for masks when grid axis-2 != max_loras
# The stride variables represent how much to increase the ptr by when
# The stride variables represent how much to increase the ptr by when
# moving by 1 element in a particular dimension. E.g. `stride_am` is
# moving by 1 element in a particular dimension. E.g. `stride_am` is
# how much to increase `a_ptr` by to get the element one row down
# how much to increase `a_ptr` by to get the element one row down
...
@@ -225,7 +226,6 @@ def _fused_moe_lora_kernel(
...
@@ -225,7 +226,6 @@ def _fused_moe_lora_kernel(
):
):
pid
=
tl
.
program_id
(
axis
=
0
)
pid
=
tl
.
program_id
(
axis
=
0
)
slice_id
=
tl
.
program_id
(
axis
=
1
)
slice_id
=
tl
.
program_id
(
axis
=
1
)
grid_k
=
tl
.
cdiv
(
K
,
BLOCK_SIZE_K
*
SPLIT_K
)
grid_k
=
tl
.
cdiv
(
K
,
BLOCK_SIZE_K
*
SPLIT_K
)
# calculate pid_m,pid_n
# calculate pid_m,pid_n
...
@@ -296,7 +296,6 @@ def _fused_moe_lora_kernel(
...
@@ -296,7 +296,6 @@ def _fused_moe_lora_kernel(
cur_b_ptr
=
tl
.
load
(
b_ptr
+
slice_id
).
to
(
tl
.
pointer_type
(
c_ptr
.
dtype
.
element_ty
))
cur_b_ptr
=
tl
.
load
(
b_ptr
+
slice_id
).
to
(
tl
.
pointer_type
(
c_ptr
.
dtype
.
element_ty
))
cur_c_ptr
=
c_ptr
+
(
slice_id
%
num_slice_c
)
*
slice_c_size
cur_c_ptr
=
c_ptr
+
(
slice_id
%
num_slice_c
)
*
slice_c_size
offs_k
=
pid_sk
*
BLOCK_SIZE_K
+
tl
.
arange
(
0
,
BLOCK_SIZE_K
)
offs_k
=
pid_sk
*
BLOCK_SIZE_K
+
tl
.
arange
(
0
,
BLOCK_SIZE_K
)
token_mask
=
offs_token
<
num_valid_tokens
token_mask
=
offs_token
<
num_valid_tokens
...
@@ -504,6 +503,7 @@ def _fused_moe_lora_shrink(
...
@@ -504,6 +503,7 @@ def _fused_moe_lora_shrink(
top_k_num
,
top_k_num
,
lora_ids
,
lora_ids
,
adapter_enabled
,
adapter_enabled
,
lora_a_stacked
[
0
].
shape
[
0
],
qcurr_hidden_states
.
stride
(
0
),
qcurr_hidden_states
.
stride
(
0
),
qcurr_hidden_states
.
stride
(
1
),
qcurr_hidden_states
.
stride
(
1
),
w1_lora_a_stacked
.
stride
(
0
),
w1_lora_a_stacked
.
stride
(
0
),
...
@@ -639,6 +639,7 @@ def _fused_moe_lora_expand(
...
@@ -639,6 +639,7 @@ def _fused_moe_lora_expand(
top_k_num
,
top_k_num
,
lora_ids
,
lora_ids
,
adapter_enabled
,
adapter_enabled
,
lora_b_stacked
[
0
].
shape
[
0
],
a_intermediate_cache1
.
stride
(
0
),
a_intermediate_cache1
.
stride
(
0
),
a_intermediate_cache1
.
stride
(
1
),
a_intermediate_cache1
.
stride
(
1
),
w1_lora_b_stacked
.
stride
(
0
),
w1_lora_b_stacked
.
stride
(
0
),
...
...
vllm/model_executor/kernels/linear/__init__.py
View file @
eefa41c1
...
@@ -82,7 +82,6 @@ from vllm.model_executor.kernels.linear.scaled_mm.rocm import (
...
@@ -82,7 +82,6 @@ from vllm.model_executor.kernels.linear.scaled_mm.rocm import (
from
vllm.model_executor.kernels.linear.scaled_mm.triton
import
(
from
vllm.model_executor.kernels.linear.scaled_mm.triton
import
(
TritonInt8ScaledMMLinearKernel
,
TritonInt8ScaledMMLinearKernel
,
)
)
from
vllm.model_executor.kernels.linear.scaled_mm.xpu
import
(
from
vllm.model_executor.kernels.linear.scaled_mm.xpu
import
(
XPUFP8ScaledMMLinearKernel
,
XPUFP8ScaledMMLinearKernel
,
)
)
...
...
vllm/model_executor/kernels/linear/scaled_mm/rocm.py
View file @
eefa41c1
...
@@ -29,7 +29,6 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl(
...
@@ -29,7 +29,6 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl(
and
B
.
shape
[
0
]
%
16
==
0
# M TODO: needed?
and
B
.
shape
[
0
]
%
16
==
0
# M TODO: needed?
and
B
.
shape
[
1
]
%
16
==
0
# K
and
B
.
shape
[
1
]
%
16
==
0
# K
and
((
bias
is
None
)
or
(
bias
.
dtype
==
out_dtype
))
and
((
bias
is
None
)
or
(
bias
.
dtype
==
out_dtype
))
and
A
.
is_contiguous
()
):
):
output
=
ops
.
wvSplitKQ
(
output
=
ops
.
wvSplitKQ
(
B
.
t
(),
B
.
t
(),
...
...
vllm/model_executor/kernels/linear/xpu.py
deleted
100644 → 0
View file @
82155c76
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
import
torch
from
vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel
import
(
# noqa: E501
FP8ScaledMMLinearKernel
,
FP8ScaledMMLinearLayerConfig
,
)
from
vllm.platforms
import
current_platform
class
XPUFP8ScaledMMLinearKernel
(
FP8ScaledMMLinearKernel
):
@
classmethod
def
is_supported
(
cls
,
compute_capability
:
int
|
None
=
None
)
->
tuple
[
bool
,
str
|
None
]:
if
not
current_platform
.
is_xpu
():
return
False
,
"XPUFP8ScaledMM only support on XPU"
return
True
,
None
@
classmethod
def
can_implement
(
cls
,
c
:
FP8ScaledMMLinearLayerConfig
)
->
tuple
[
bool
,
str
|
None
]:
if
c
.
weight_quant_key
.
dtype
not
in
{
torch
.
float8_e5m2
,
torch
.
float8_e4m3fn
}:
return
False
,
"XPUFP8ScaledMM only support FP8 weight dtype"
return
True
,
None
def
__init__
(
self
,
c
:
FP8ScaledMMLinearLayerConfig
,
layer_param_names
:
Sequence
[
str
]
)
->
None
:
assert
self
.
can_implement
(
c
)[
0
]
assert
self
.
is_supported
()[
0
]
self
.
config
=
c
self
.
layer_param_names
=
layer_param_names
def
apply_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
bias
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
:
weight
=
layer
.
weight
weight_scale
=
layer
.
weight_scale
return
torch
.
ops
.
_xpu_C
.
fp8_gemm_w8a16
(
x
,
weight
,
weight_scale
,
bias
)
def
apply_scaled_mm
(
self
,
*
,
A
:
torch
.
Tensor
,
B
:
torch
.
Tensor
,
out_dtype
:
torch
.
dtype
,
As
:
torch
.
Tensor
,
Bs
:
torch
.
Tensor
,
bias
:
torch
.
Tensor
|
None
,
output_shape
:
list
,
)
->
torch
.
Tensor
:
pass
vllm/model_executor/layers/attention/mla_attention.py
View file @
eefa41c1
...
@@ -2291,12 +2291,14 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
...
@@ -2291,12 +2291,14 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
self
,
prefill
:
MLACommonPrefillMetadata
,
chunk_idx
:
int
,
q
,
k
,
v
self
,
prefill
:
MLACommonPrefillMetadata
,
chunk_idx
:
int
,
q
,
k
,
v
):
):
assert
isinstance
(
prefill
,
FlashInferPrefillMetadata
)
assert
isinstance
(
prefill
,
FlashInferPrefillMetadata
)
attn_out
,
lse
=
prefill
.
prefill_chunks
[
chunk_idx
].
run
(
attn_out
,
lse
=
prefill
.
prefill_chunks
[
chunk_idx
].
run
(
q
=
q
,
q
=
q
,
k
=
k
,
k
=
k
,
v
=
v
,
v
=
v
,
return_lse
=
True
,
return_lse
=
True
,
)
)
# Convert from (q_len, num_heads) to (num_heads, q_len)
# Convert from (q_len, num_heads) to (num_heads, q_len)
return
attn_out
,
lse
.
transpose
(
0
,
1
).
contiguous
()
return
attn_out
,
lse
.
transpose
(
0
,
1
).
contiguous
()
...
...
vllm/model_executor/layers/fused_moe/all2all_utils.py
View file @
eefa41c1
...
@@ -9,6 +9,7 @@ from vllm.config import get_current_vllm_config
...
@@ -9,6 +9,7 @@ from vllm.config import get_current_vllm_config
from
vllm.distributed
import
(
from
vllm.distributed
import
(
get_ep_group
,
get_ep_group
,
)
)
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe.config
import
(
from
vllm.model_executor.layers.fused_moe.config
import
(
FusedMoEConfig
,
FusedMoEConfig
,
FusedMoEParallelConfig
,
FusedMoEParallelConfig
,
...
@@ -30,6 +31,8 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
...
@@ -30,6 +31,8 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.import_utils
import
has_deep_ep
,
has_mori
,
has_nixl_ep
from
vllm.utils.import_utils
import
has_deep_ep
,
has_mori
,
has_nixl_ep
logger
=
init_logger
(
__name__
)
if
current_platform
.
is_cuda_alike
():
if
current_platform
.
is_cuda_alike
():
if
has_deep_ep
():
if
has_deep_ep
():
from
.deepep_ht_prepare_finalize
import
DeepEPHTPrepareAndFinalize
from
.deepep_ht_prepare_finalize
import
DeepEPHTPrepareAndFinalize
...
@@ -90,6 +93,20 @@ def maybe_make_prepare_finalize(
...
@@ -90,6 +93,20 @@ def maybe_make_prepare_finalize(
allow_new_interface
:
bool
=
False
,
allow_new_interface
:
bool
=
False
,
use_monolithic
:
bool
=
False
,
use_monolithic
:
bool
=
False
,
)
->
FusedMoEPrepareAndFinalize
|
None
:
)
->
FusedMoEPrepareAndFinalize
|
None
:
# NOTE(rob): we are migrating each quant_method to hold the MK
# in all cases. The allow_new_interface=False flag allow us to fall
# back to the old method for methods that have not yet been migrated.
#
# In old method:
# * maybe_init_modular_kernel() calls this function. If we are
# using no Dp/Ep or naive all2all, we return None this function
# returns None and no ModularKernelMethod is created. If non-naive
# all2all is used, this returns a PrepareAndFinalize object and
# a ModularKernelMethod is created.
# In new method:
# * maybe_make_prepare_finalize() is called from the oracle. We
# always return a PrepareAndFinalize object and the quant method
# holds the ModularKernel.
if
not
moe
.
moe_parallel_config
.
use_all2all_kernels
:
if
not
moe
.
moe_parallel_config
.
use_all2all_kernels
:
if
not
allow_new_interface
:
if
not
allow_new_interface
:
return
None
return
None
...
@@ -115,11 +132,6 @@ def maybe_make_prepare_finalize(
...
@@ -115,11 +132,6 @@ def maybe_make_prepare_finalize(
prepare_finalize
:
FusedMoEPrepareAndFinalize
|
None
=
None
prepare_finalize
:
FusedMoEPrepareAndFinalize
|
None
=
None
# TODO(rob): update this as part of the MoE refactor.
assert
not
moe
.
use_flashinfer_cutlass_kernels
,
(
"Must be created in modelopt.py or fp8.py"
)
if
moe
.
use_deepep_ht_kernels
:
if
moe
.
use_deepep_ht_kernels
:
assert
moe
.
dp_size
==
all2all_manager
.
dp_world_size
assert
moe
.
dp_size
==
all2all_manager
.
dp_world_size
...
@@ -258,4 +270,5 @@ def maybe_make_prepare_finalize(
...
@@ -258,4 +270,5 @@ def maybe_make_prepare_finalize(
physical_to_global
=
physical_to_global
,
physical_to_global
=
physical_to_global
,
local_expert_global_ids
=
local_expert_global_ids
,
local_expert_global_ids
=
local_expert_global_ids
,
)
)
return
prepare_finalize
return
prepare_finalize
\ No newline at end of file
vllm/model_executor/layers/fused_moe/config.py
View file @
eefa41c1
...
@@ -17,7 +17,6 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
...
@@ -17,7 +17,6 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import (
)
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
GroupShape
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.import_utils
import
has_triton_kernels
from
vllm.utils.import_utils
import
has_triton_kernels
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.math_utils
import
cdiv
...
...
vllm/model_executor/layers/fused_moe/cutlass_moe.py
View file @
eefa41c1
...
@@ -107,7 +107,13 @@ def run_cutlass_moe_fp8(
...
@@ -107,7 +107,13 @@ def run_cutlass_moe_fp8(
),
"Intermediate scale shape mismatch"
),
"Intermediate scale shape mismatch"
assert
out_dtype
in
[
torch
.
half
,
torch
.
bfloat16
],
"Invalid output dtype"
assert
out_dtype
in
[
torch
.
half
,
torch
.
bfloat16
],
"Invalid output dtype"
if
expert_map
is
not
None
:
# NOTE(rob): the expert_map is used for the STANDARD case and
# the batched format is used by the BATCHED case.
# TODO(rob): update the MK interface to only pass the expert_map
# during the STANDARD case to make this clearer across all kernels.
if
use_batched_format
:
assert
expert_num_tokens
is
not
None
else
:
assert
expert_num_tokens
is
None
assert
expert_num_tokens
is
None
# We have two modes: batched experts and non-batched experts.
# We have two modes: batched experts and non-batched experts.
...
...
vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
View file @
eefa41c1
...
@@ -103,6 +103,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -103,6 +103,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
num_experts
:
int
,
num_experts
:
int
,
a1_scale
:
torch
.
Tensor
|
None
,
a1_scale
:
torch
.
Tensor
|
None
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
,
)
->
Callable
:
)
->
Callable
:
has_scales
=
token_scales
is
not
None
has_scales
=
token_scales
is
not
None
...
@@ -174,6 +175,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -174,6 +175,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_topk_weights
,
expert_topk_weights
,
a1_scale
,
a1_scale
,
quant_config
,
quant_config
,
defer_input_quant
=
defer_input_quant
,
)
)
def
_receiver
(
def
_receiver
(
...
@@ -187,6 +189,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -187,6 +189,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_topk_weights
:
torch
.
Tensor
|
None
,
expert_topk_weights
:
torch
.
Tensor
|
None
,
a1_scale
:
torch
.
Tensor
|
None
,
a1_scale
:
torch
.
Tensor
|
None
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
if
event
.
event
is
not
None
:
if
event
.
event
is
not
None
:
event
.
current_stream_wait
()
event
.
current_stream_wait
()
...
@@ -221,14 +224,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -221,14 +224,15 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_num_tokens_per_expert_list
,
device
=
expert_x
.
device
expert_num_tokens_per_expert_list
,
device
=
expert_x
.
device
)
)
#
D
ispatch and
Q
uant
#
* For non-block quant, d
ispatch
in b16
and
q
uant
ize now as
# DeepEP kernels only support dispatching block
-quantized
#
DeepEP kernels only support dispatching block
scales.
#
activation scales.
#
* For expert kernels that require unquantized inputs,
#
Dispatch in bfloat16 and quantize afterwards
#
defer quantization to FusedMoEExpertsPermuteUnpermute.
if
not
quant_config
.
is_block_quantized
:
if
not
quant_config
.
is_block_quantized
and
not
defer_input_quant
:
# Quantize after dispatch.
# Quantize after dispatch.
expert_x_scale
=
None
expert_x_scale
=
None
if
expert_x
.
numel
()
!=
0
:
if
expert_x
.
numel
()
!=
0
:
# TODO: support per_act_token_quant,
expert_x
,
expert_x_scale
=
moe_kernel_quantize_input
(
expert_x
,
expert_x_scale
=
moe_kernel_quantize_input
(
expert_x
,
expert_x
,
a1_scale
,
a1_scale
,
...
@@ -258,6 +262,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -258,6 +262,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
ReceiverType
:
)
->
mk
.
ReceiverType
:
if
apply_router_weight_on_input
:
if
apply_router_weight_on_input
:
topk
=
topk_ids
.
size
(
1
)
topk
=
topk_ids
.
size
(
1
)
...
@@ -267,8 +272,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -267,8 +272,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
)
)
a1
=
a1
*
topk_weights
.
to
(
a1
.
dtype
)
a1
=
a1
*
topk_weights
.
to
(
a1
.
dtype
)
if
quant_config
.
is_block_quantized
:
# * DeepEP only supports fp8 block scales so quantize
# Quant and Dispatch
# before the dispatch for these models.
# * For all other quantization, dispatch after.
# * For expert kernels that require unquantized inputs,
# defer quantization to FusedMoEExpertsPermuteUnpermute.
if
quant_config
.
is_block_quantized
and
not
defer_input_quant
:
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1
,
a1
,
quant_config
.
a1_scale
,
quant_config
.
a1_scale
,
...
@@ -282,7 +291,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -282,7 +291,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
else
:
else
:
a1q
=
a1
a1q
=
a1
a1q_scale
=
None
a1q_scale
=
None
a1_post_scale
=
quant_config
.
a1_scale
a1_post_scale
=
(
quant_config
.
a1_gscale
if
quant_config
.
quant_dtype
==
"nvfp4"
else
quant_config
.
a1_scale
)
return
self
.
_do_dispatch
(
return
self
.
_do_dispatch
(
tokens
=
a1q
,
tokens
=
a1q
,
...
@@ -292,6 +305,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -292,6 +305,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
num_experts
=
num_experts
,
num_experts
=
num_experts
,
a1_scale
=
a1_post_scale
,
a1_scale
=
a1_post_scale
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
defer_input_quant
=
defer_input_quant
,
)
)
def
prepare
(
def
prepare
(
...
@@ -303,6 +317,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -303,6 +317,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
receiver
=
self
.
prepare_async
(
receiver
=
self
.
prepare_async
(
a1
,
a1
,
...
@@ -312,6 +327,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -312,6 +327,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map
,
expert_map
,
apply_router_weight_on_input
,
apply_router_weight_on_input
,
quant_config
,
quant_config
,
defer_input_quant
,
)
)
return
receiver
()
return
receiver
()
...
...
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
View file @
eefa41c1
...
@@ -241,7 +241,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -241,7 +241,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
tuple
[
Callable
,
mk
.
ReceiverType
]:
)
->
tuple
[
Callable
,
mk
.
ReceiverType
]:
if
defer_input_quant
:
raise
NotImplementedError
(
f
"
{
self
.
__class__
.
__name__
}
does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
hidden_size
=
a1
.
size
(
1
)
hidden_size
=
a1
.
size
(
1
)
assert
hidden_size
in
self
.
SUPPORTED_HIDDEN_SIZES
,
(
assert
hidden_size
in
self
.
SUPPORTED_HIDDEN_SIZES
,
(
...
@@ -344,7 +350,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -344,7 +350,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
if
defer_input_quant
:
raise
NotImplementedError
(
f
"
{
self
.
__class__
.
__name__
}
does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
hook
,
receiver
=
self
.
prepare_async
(
hook
,
receiver
=
self
.
prepare_async
(
a1
,
a1
,
topk_weights
,
topk_weights
,
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
View file @
eefa41c1
...
@@ -117,16 +117,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
...
@@ -117,16 +117,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
dtype
=
torch
.
float32
,
dtype
=
torch
.
float32
,
)
)
@
staticmethod
@
property
def
expects_unquantized_inputs
(
def
expects_unquantized_inputs
(
self
)
->
bool
:
moe_config
:
mk
.
FusedMoEConfig
,
quant_config
:
FusedMoEQuantConfig
return
self
.
quant_config
.
use_fp8_w8a8
and
self
.
quant_config
.
is_block_quantized
)
->
bool
:
# NVFP4 TP kernels and FP8 block-quantized kernels apply
# input quantization inside FusedMoEPermuteExpertsUnpermute.
return
(
quant_config
.
use_nvfp4_w4a4
and
not
moe_config
.
moe_parallel_config
.
use_all2all_kernels
)
or
(
quant_config
.
use_fp8_w8a8
and
quant_config
.
is_block_quantized
)
@
staticmethod
@
staticmethod
def
_supports_current_device
()
->
bool
:
def
_supports_current_device
()
->
bool
:
...
@@ -241,9 +234,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
...
@@ -241,9 +234,9 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
"""
"""
workspace1
=
(
M
,
K
)
workspace1
=
(
M
,
K
)
workspace2
=
(
0
,)
workspace2
=
(
0
,)
# For NVFP4, the output is stored in a packed int8 format,
#
For TP, the quantization is fused with fused_moe call
.
#
so the actual hidden dim is 2x the size of K here
.
output_shape
=
(
M
,
K
*
2
if
self
.
quant_dtype
==
"nvfp4"
and
self
.
use_dp
else
K
)
output_shape
=
(
M
,
K
*
2
if
self
.
quant_dtype
==
"nvfp4"
else
K
)
# The workspace is determined by `aq`, since it comes after any
# The workspace is determined by `aq`, since it comes after any
# potential communication op and is involved in the expert computation.
# potential communication op and is involved in the expert computation.
return
(
workspace1
,
workspace2
,
output_shape
)
return
(
workspace1
,
workspace2
,
output_shape
)
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
deleted
100644 → 0
View file @
82155c76
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.distributed
import
get_dp_group
,
get_ep_group
from
vllm.distributed.device_communicators.base_device_communicator
import
(
All2AllManagerBase
,
)
from
vllm.forward_context
import
get_forward_context
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.prepare_finalize
import
(
MoEPrepareAndFinalizeNoEP
,
)
from
vllm.model_executor.layers.fused_moe.topk_weight_and_reduce
import
(
TopKWeightAndReduceNoOP
,
)
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.utils.flashinfer
import
nvfp4_block_scale_interleave
def
get_local_sizes
():
return
get_forward_context
().
dp_metadata
.
get_chunk_sizes_across_dp_rank
()
class
FlashInferCutlassMoEPrepareAndFinalize
(
mk
.
FusedMoEPrepareAndFinalize
):
"""Base class for FlashInfer MoE prepare and finalize operations."""
def
__init__
(
self
,
use_dp
:
bool
,
num_dispatchers
:
int
=
1
,
use_deepseek_fp8_block_scale
:
bool
=
False
,
):
super
().
__init__
()
self
.
num_dispatchers_
=
num_dispatchers
self
.
use_dp
=
use_dp
self
.
local_tokens
=
None
# Toggle for DeepSeek-style FP8 block-scale path where activations are
# not quantized here and weight block scales are consumed by the kernel.
self
.
use_deepseek_fp8_block_scale
=
use_deepseek_fp8_block_scale
@
property
def
activation_format
(
self
)
->
mk
.
FusedMoEActivationFormat
:
return
mk
.
FusedMoEActivationFormat
.
Standard
def
max_num_tokens_per_rank
(
self
)
->
int
|
None
:
return
None
def
topk_indices_dtype
(
self
)
->
torch
.
dtype
|
None
:
return
None
def
num_dispatchers
(
self
)
->
int
:
return
self
.
num_dispatchers_
def
output_is_reduced
(
self
)
->
bool
:
return
False
def
_apply_router_weight_on_input
(
self
,
a1
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
apply_router_weight_on_input
:
bool
,
)
->
None
:
"""Apply router weight on input if needed."""
if
apply_router_weight_on_input
:
topk
=
topk_ids
.
size
(
1
)
assert
topk
==
1
,
(
"apply_router_weight_on_input is only implemented for topk=1"
)
a1
.
mul_
(
topk_weights
.
to
(
a1
.
dtype
))
class
FlashInferAllToAllMoEPrepareAndFinalize
(
FlashInferCutlassMoEPrepareAndFinalize
):
"""FlashInfer implementation using AllToAll communication."""
def
__init__
(
self
,
use_dp
:
bool
,
num_dispatchers
:
int
=
1
,
use_deepseek_fp8_block_scale
:
bool
=
False
,
):
super
().
__init__
(
use_dp
,
num_dispatchers
,
use_deepseek_fp8_block_scale
)
self
.
alltoall_info
=
None
# Initialize all2all_manager only for DP case
self
.
all2all_manager
=
None
if
self
.
use_dp
:
self
.
all2all_manager
=
get_ep_group
().
device_communicator
.
all2all_manager
def
prepare
(
self
,
a1
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
num_experts
:
int
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
)
->
mk
.
PrepareResultType
:
self
.
_apply_router_weight_on_input
(
a1
,
topk_weights
,
topk_ids
,
apply_router_weight_on_input
)
if
not
self
.
use_dp
:
# Non-DP case: quantize activations unless using block-scale path
if
not
self
.
use_deepseek_fp8_block_scale
:
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1
,
quant_config
.
a1_gscale
,
quant_config
.
quant_dtype
,
quant_config
.
per_act_token_quant
,
quant_config
.
block_shape
,
is_fp4_scale_swizzled
=
not
self
.
use_dp
,
)
else
:
a1q
=
a1
a1q_scale
=
None
else
:
# DP case: use FlashInfer AllToAll
global_num_tokens_cpu
=
get_local_sizes
()
top_k
=
topk_ids
.
size
(
1
)
(
self
.
alltoall_info
,
topk_ids
,
topk_weights
,
a1q
,
a1q_scale
)
=
(
flashinfer_alltoall_dispatch
(
self
.
all2all_manager
,
global_num_tokens_cpu
,
a1
,
quant_config
.
a1_gscale
,
topk_ids
,
topk_weights
,
top_k
,
num_experts
,
quant_config
,
use_deepseek_fp8_block_scale
=
self
.
use_deepseek_fp8_block_scale
,
)
)
return
a1q
,
a1q_scale
,
None
,
topk_ids
,
topk_weights
def
finalize
(
self
,
output
:
torch
.
Tensor
,
fused_expert_output
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
apply_router_weight_on_input
:
bool
,
weight_and_reduce_impl
:
mk
.
TopKWeightAndReduce
,
)
->
None
:
if
self
.
use_dp
:
top_k
=
topk_ids
.
size
(
1
)
token_count
=
output
.
shape
[
0
]
fused_expert_output
=
flashinfer_alltoall_combine
(
self
.
all2all_manager
,
fused_expert_output
,
top_k
=
top_k
,
token_count
=
token_count
,
alltoall_info
=
self
.
alltoall_info
,
)
output
.
copy_
(
fused_expert_output
)
class
FlashInferAllGatherMoEPrepareAndFinalize
(
FlashInferCutlassMoEPrepareAndFinalize
):
def
__init__
(
self
,
use_dp
:
bool
,
num_dispatchers
:
int
=
1
,
use_deepseek_fp8_block_scale
:
bool
=
False
,
):
super
().
__init__
(
use_dp
,
num_dispatchers
,
use_deepseek_fp8_block_scale
)
def
prepare
(
self
,
a1
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
num_experts
:
int
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
)
->
mk
.
PrepareResultType
:
self
.
_apply_router_weight_on_input
(
a1
,
topk_weights
,
topk_ids
,
apply_router_weight_on_input
)
is_nvfp4
=
quant_config
.
quant_dtype
==
"nvfp4"
if
not
self
.
use_dp
and
is_nvfp4
:
return
a1
,
None
,
None
,
topk_ids
,
topk_weights
if
not
self
.
use_deepseek_fp8_block_scale
:
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1
,
quant_config
.
a1_gscale
if
is_nvfp4
else
quant_config
.
a1_scale
,
quant_config
.
quant_dtype
,
quant_config
.
per_act_token_quant
,
quant_config
.
block_shape
,
is_fp4_scale_swizzled
=
not
self
.
use_dp
,
)
else
:
# Block-scale path: pass activations through, omit per-token scales
a1q
=
a1
a1q_scale
=
None
if
self
.
use_dp
:
# Build gather list conditionally - omit a1q_scale if None
# (block-scale path)
gather_list
=
[
topk_weights
,
topk_ids
,
a1q
]
if
a1q_scale
is
not
None
:
gather_list
.
append
(
a1q_scale
)
gathered
=
get_dp_group
().
all_gatherv
(
gather_list
,
dim
=
0
,
sizes
=
get_local_sizes
(),
)
topk_weights
,
topk_ids
,
a1q
,
a1q_scale
=
gathered
else
:
gathered
=
get_dp_group
().
all_gatherv
(
gather_list
,
dim
=
0
,
sizes
=
get_local_sizes
(),
)
topk_weights
,
topk_ids
,
a1q
=
gathered
a1q_scale
=
None
if
is_nvfp4
and
a1q_scale
is
not
None
:
if
a1q_scale
.
element_size
()
==
1
:
a1q_scale
=
a1q_scale
.
view
(
torch
.
uint8
)
a1q_scale
=
nvfp4_block_scale_interleave
(
a1q_scale
)
return
a1q
,
a1q_scale
,
None
,
topk_ids
,
topk_weights
def
finalize
(
self
,
output
:
torch
.
Tensor
,
fused_expert_output
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
apply_router_weight_on_input
:
bool
,
weight_and_reduce_impl
:
mk
.
TopKWeightAndReduce
,
)
->
None
:
assert
isinstance
(
weight_and_reduce_impl
,
TopKWeightAndReduceNoOP
)
if
self
.
use_dp
:
fused_expert_output
=
get_dp_group
().
reduce_scatterv
(
fused_expert_output
,
dim
=
0
,
sizes
=
get_local_sizes
()
)
output
.
copy_
(
fused_expert_output
)
def
flashinfer_alltoall_dispatch
(
all2all_manager
:
All2AllManagerBase
,
global_num_tokens_cpu
:
list
[
int
],
x
:
torch
.
Tensor
,
gs
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
top_k
:
int
,
num_experts
:
int
,
quant_config
:
FusedMoEQuantConfig
,
use_deepseek_fp8_block_scale
:
bool
=
False
,
):
from
flashinfer.comm.trtllm_alltoall
import
MnnvlMoe
assert
all2all_manager
.
ensure_alltoall_workspace_initialized
(),
(
"FlashInfer AllToAll workspace not available"
)
ep_rank
=
all2all_manager
.
rank
ep_size
=
all2all_manager
.
world_size
max_num_token
=
(
max
(
global_num_tokens_cpu
)
if
global_num_tokens_cpu
is
not
None
else
x
.
shape
[
0
]
)
orig_topk_weights_dtype
=
topk_weights
.
dtype
alltoall_info
,
topk_ids
,
topk_weights
,
_
=
(
MnnvlMoe
.
mnnvl_moe_alltoallv_prepare_without_allgather
(
topk_ids
,
topk_weights
,
None
,
all2all_manager
.
prepare_workspace_tensor
,
max_num_token
,
ep_rank
,
ep_size
,
num_experts
,
num_experts
,
top_k
,
)
)
topk_weights
=
topk_weights
.
view
(
dtype
=
orig_topk_weights_dtype
)
if
not
use_deepseek_fp8_block_scale
:
x
,
x_sf
=
moe_kernel_quantize_input
(
x
,
gs
,
quant_config
.
quant_dtype
,
quant_config
.
per_act_token_quant
,
quant_config
.
block_shape
,
is_fp4_scale_swizzled
=
False
,
# delay swizzle to after comm
)
x
=
MnnvlMoe
.
mnnvl_moe_alltoallv
(
x
,
alltoall_info
,
all2all_manager
.
workspace_tensor
,
ep_rank
,
ep_size
,
)
x_sf
=
MnnvlMoe
.
mnnvl_moe_alltoallv
(
x_sf
,
alltoall_info
,
all2all_manager
.
workspace_tensor
,
ep_rank
,
ep_size
,
)
if
quant_config
.
quant_dtype
==
"nvfp4"
:
x_sf
=
nvfp4_block_scale_interleave
(
x_sf
)
else
:
# Block-scale path: pass activations through without quantization
x_sf
=
None
x
=
MnnvlMoe
.
mnnvl_moe_alltoallv
(
x
,
alltoall_info
,
all2all_manager
.
workspace_tensor
,
ep_rank
,
ep_size
,
)
return
alltoall_info
,
topk_ids
,
topk_weights
,
x
,
x_sf
def
flashinfer_alltoall_combine
(
all2all_manager
:
All2AllManagerBase
,
output
:
torch
.
Tensor
,
top_k
:
int
,
token_count
:
int
,
alltoall_info
,
):
from
flashinfer.comm.trtllm_alltoall
import
MnnvlMoe
assert
all2all_manager
.
ensure_alltoall_workspace_initialized
(),
(
"FlashInfer AllToAll workspace not available"
)
return
MnnvlMoe
.
mnnvl_moe_alltoallv_combine
(
output
,
alltoall_info
,
all2all_manager
.
workspace_tensor
,
ep_rank
=
all2all_manager
.
rank
,
ep_size
=
all2all_manager
.
world_size
,
top_k
=
top_k
,
token_count
=
token_count
,
)
def
create_flashinfer_prepare_finalize
(
use_dp
:
bool
,
use_nvfp4
:
bool
=
False
,
enable_alltoallv
:
bool
=
False
,
use_deepseek_fp8_block_scale
:
bool
=
False
,
)
->
FlashInferCutlassMoEPrepareAndFinalize
|
MoEPrepareAndFinalizeNoEP
:
"""Factory function to create the appropriate FlashInfer implementation."""
if
use_dp
:
if
enable_alltoallv
:
assert
use_nvfp4
return
FlashInferAllToAllMoEPrepareAndFinalize
(
use_dp
)
return
FlashInferAllGatherMoEPrepareAndFinalize
(
use_dp
=
True
,
use_deepseek_fp8_block_scale
=
use_deepseek_fp8_block_scale
,
)
else
:
# CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization
# in a single call with the MoE experts kernel.
defer_input_quant
=
use_deepseek_fp8_block_scale
or
use_nvfp4
return
MoEPrepareAndFinalizeNoEP
(
defer_input_quant
=
defer_input_quant
)
\ No newline at end of file
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
View file @
eefa41c1
...
@@ -534,7 +534,13 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -534,7 +534,13 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
if
defer_input_quant
:
raise
NotImplementedError
(
f
"
{
self
.
__class__
.
__name__
}
does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
assert
a1
.
dim
()
==
2
assert
a1
.
dim
()
==
2
assert
topk_ids
.
dim
()
==
2
assert
topk_ids
.
dim
()
==
2
assert
topk_ids
.
size
(
0
)
==
a1
.
size
(
0
)
assert
topk_ids
.
size
(
0
)
==
a1
.
size
(
0
)
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
eefa41c1
...
@@ -164,7 +164,6 @@ def fused_moe_kernel_gptq_awq(
...
@@ -164,7 +164,6 @@ def fused_moe_kernel_gptq_awq(
group_size_m
=
min
(
num_pid_m
-
first_pid_m
,
GROUP_SIZE_M
)
group_size_m
=
min
(
num_pid_m
-
first_pid_m
,
GROUP_SIZE_M
)
pid_m
=
first_pid_m
+
((
pid
%
num_pid_in_group
)
%
group_size_m
)
pid_m
=
first_pid_m
+
((
pid
%
num_pid_in_group
)
%
group_size_m
)
pid_n
=
(
pid
%
num_pid_in_group
)
//
group_size_m
pid_n
=
(
pid
%
num_pid_in_group
)
//
group_size_m
# ----------------------------------------------------------
# ----------------------------------------------------------
# Create pointers for the first blocks of A and B.
# Create pointers for the first blocks of A and B.
# We will advance this pointer as we move in the K direction
# We will advance this pointer as we move in the K direction
...
...
vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
View file @
eefa41c1
...
@@ -5,6 +5,7 @@ from abc import abstractmethod
...
@@ -5,6 +5,7 @@ from abc import abstractmethod
import
torch
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe.config
import
(
from
vllm.model_executor.layers.fused_moe.config
import
(
FusedMoEConfig
,
FusedMoEConfig
,
...
...
Prev
1
2
3
4
5
6
7
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment