Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1b25f1fe
Unverified
Commit
1b25f1fe
authored
Jul 24, 2025
by
Shu Wang
Committed by
GitHub
Jul 24, 2025
Browse files
Update flashinfer CUTLASS MoE Kernel (#21408)
Signed-off-by:
Shu Wang.
<
shuw@nvidia.com
>
parent
e8cb0d04
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
8 deletions
+8
-8
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
...r/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+2
-2
vllm/model_executor/layers/quantization/modelopt.py
vllm/model_executor/layers/quantization/modelopt.py
+2
-2
vllm/utils/flashinfer.py
vllm/utils/flashinfer.py
+4
-4
No files found.
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
View file @
1b25f1fe
...
@@ -11,7 +11,7 @@ from vllm.forward_context import get_forward_context
...
@@ -11,7 +11,7 @@ from vllm.forward_context import get_forward_context
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.utils
import
(
from
vllm.model_executor.layers.fused_moe.utils
import
(
extract_required_args
,
moe_kernel_quantize_input
)
extract_required_args
,
moe_kernel_quantize_input
)
from
vllm.utils.flashinfer
import
block_scale_interleave
from
vllm.utils.flashinfer
import
nvfp4_
block_scale_interleave
def
get_local_sizes
(
local_tokens
):
def
get_local_sizes
(
local_tokens
):
...
@@ -92,7 +92,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -92,7 +92,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
dim
=
0
,
dim
=
0
,
sizes
=
get_local_sizes
(
local_tokens
))
sizes
=
get_local_sizes
(
local_tokens
))
a1_m
,
a1_n
=
a1q
.
shape
a1_m
,
a1_n
=
a1q
.
shape
a1q_scale
=
block_scale_interleave
(
a1q_scale
)
a1q_scale
=
nvfp4_
block_scale_interleave
(
a1q_scale
)
return
a1q
,
a1q_scale
,
None
,
topk_ids
,
topk_weights
return
a1q
,
a1q_scale
,
None
,
topk_ids
,
topk_weights
...
...
vllm/model_executor/layers/quantization/modelopt.py
View file @
1b25f1fe
...
@@ -1254,8 +1254,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
...
@@ -1254,8 +1254,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
x
,
layer
.
w13_weight
,
layer
.
w2_weight
),
(
x
,
layer
.
w13_weight
,
layer
.
w2_weight
),
(
"Flashinfer CUTLASS Fused MoE not applicable!"
)
"Flashinfer CUTLASS Fused MoE not applicable!"
)
a1_gscale
=
torch
.
min
(
layer
.
w13_input_scale_quant
)
a1_gscale
=
layer
.
w13_input_scale_quant
a2_gscale
=
torch
.
min
(
layer
.
w2_input_scale_quant
)
a2_gscale
=
layer
.
w2_input_scale_quant
extra_expert_args
=
{
extra_expert_args
=
{
'g1_alphas'
:
layer
.
g1_alphas
,
'g1_alphas'
:
layer
.
g1_alphas
,
'g2_alphas'
:
layer
.
g2_alphas
,
'g2_alphas'
:
layer
.
g2_alphas
,
...
...
vllm/utils/flashinfer.py
View file @
1b25f1fe
...
@@ -69,8 +69,8 @@ flashinfer_trtllm_fp8_block_scale_moe = _lazy_import_wrapper(
...
@@ -69,8 +69,8 @@ flashinfer_trtllm_fp8_block_scale_moe = _lazy_import_wrapper(
flashinfer_cutlass_fused_moe
=
_lazy_import_wrapper
(
"flashinfer.fused_moe"
,
flashinfer_cutlass_fused_moe
=
_lazy_import_wrapper
(
"flashinfer.fused_moe"
,
"cutlass_fused_moe"
)
"cutlass_fused_moe"
)
fp4_quantize
=
_lazy_import_wrapper
(
"flashinfer"
,
"fp4_quantize"
)
fp4_quantize
=
_lazy_import_wrapper
(
"flashinfer"
,
"fp4_quantize"
)
block_scale_interleave
=
_lazy_import_wrapper
(
"flashinfer"
,
nvfp4_
block_scale_interleave
=
_lazy_import_wrapper
(
"
block_scale_interleave"
)
"flashinfer"
,
"nvfp4_
block_scale_interleave"
)
# Special case for autotune since it returns a context manager
# Special case for autotune since it returns a context manager
autotune
=
_lazy_import_wrapper
(
autotune
=
_lazy_import_wrapper
(
...
@@ -95,7 +95,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
...
@@ -95,7 +95,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
required_functions
=
[
required_functions
=
[
(
"flashinfer.fused_moe"
,
"cutlass_fused_moe"
),
(
"flashinfer.fused_moe"
,
"cutlass_fused_moe"
),
(
"flashinfer"
,
"fp4_quantize"
),
(
"flashinfer"
,
"fp4_quantize"
),
(
"flashinfer"
,
"block_scale_interleave"
),
(
"flashinfer"
,
"
nvfp4_
block_scale_interleave"
),
]
]
for
module_name
,
attr_name
in
required_functions
:
for
module_name
,
attr_name
in
required_functions
:
...
@@ -110,7 +110,7 @@ __all__ = [
...
@@ -110,7 +110,7 @@ __all__ = [
"flashinfer_trtllm_fp8_block_scale_moe"
,
"flashinfer_trtllm_fp8_block_scale_moe"
,
"flashinfer_cutlass_fused_moe"
,
"flashinfer_cutlass_fused_moe"
,
"fp4_quantize"
,
"fp4_quantize"
,
"block_scale_interleave"
,
"
nvfp4_
block_scale_interleave"
,
"autotune"
,
"autotune"
,
"has_flashinfer_moe"
,
"has_flashinfer_moe"
,
"has_flashinfer_cutlass_fused_moe"
,
"has_flashinfer_cutlass_fused_moe"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment