Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9e23ad96
Unverified
Commit
9e23ad96
authored
Jul 22, 2025
by
Shu Wang
Committed by
GitHub
Jul 21, 2025
Browse files
Update fp4 quantize API (#21327)
Signed-off-by:
Shu Wang
<
shuw@nvidia.com
>
parent
e69a92a1
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
11 additions
and
11 deletions
+11
-11
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
...model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+5
-5
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
...r/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+2
-2
vllm/utils/flashinfer.py
vllm/utils/flashinfer.py
+4
-4
No files found.
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
View file @
9e23ad96
...
@@ -181,12 +181,12 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -181,12 +181,12 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
g2_alphas
,
g2_alphas
,
]
]
_
=
flashinfer_cutlass_fused_moe
(
_
=
flashinfer_cutlass_fused_moe
(
hidden_states
,
input
=
hidden_states
,
topk_ids
.
to
(
torch
.
int
),
token_selected_experts
=
topk_ids
.
to
(
torch
.
int
),
topk_weights
,
token_final_scales
=
topk_weights
,
# FlashInfer API requires weight to be long for nvfp4
# FlashInfer API requires weight to be long for nvfp4
w1
.
view
(
torch
.
long
),
fc1_expert_weights
=
w1
.
view
(
torch
.
long
),
w2
.
view
(
torch
.
long
),
fc2_expert_weights
=
w2
.
view
(
torch
.
long
),
output_dtype
=
out_dtype
,
output_dtype
=
out_dtype
,
quant_scales
=
quant_scales
,
quant_scales
=
quant_scales
,
input_sf
=
a1q_scale
,
input_sf
=
a1q_scale
,
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
View file @
9e23ad96
...
@@ -11,7 +11,7 @@ from vllm.forward_context import get_forward_context
...
@@ -11,7 +11,7 @@ from vllm.forward_context import get_forward_context
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.utils
import
(
from
vllm.model_executor.layers.fused_moe.utils
import
(
extract_required_args
,
moe_kernel_quantize_input
)
extract_required_args
,
moe_kernel_quantize_input
)
from
vllm.utils.flashinfer
import
fp4_swizzle_
blockscale
from
vllm.utils.flashinfer
import
block
_
scale
_interleave
def
get_local_sizes
(
local_tokens
):
def
get_local_sizes
(
local_tokens
):
...
@@ -92,7 +92,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
...
@@ -92,7 +92,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
dim
=
0
,
dim
=
0
,
sizes
=
get_local_sizes
(
local_tokens
))
sizes
=
get_local_sizes
(
local_tokens
))
a1_m
,
a1_n
=
a1q
.
shape
a1_m
,
a1_n
=
a1q
.
shape
a1q_scale
=
fp4_swizzle_
blockscale
(
a1q_scale
,
a1_m
,
a1_n
*
2
)
a1q_scale
=
block
_
scale
_interleave
(
a1q_scale
)
return
a1q
,
a1q_scale
,
None
,
topk_ids
,
topk_weights
return
a1q
,
a1q_scale
,
None
,
topk_ids
,
topk_weights
...
...
vllm/utils/flashinfer.py
View file @
9e23ad96
...
@@ -69,8 +69,8 @@ flashinfer_trtllm_fp8_block_scale_moe = _lazy_import_wrapper(
...
@@ -69,8 +69,8 @@ flashinfer_trtllm_fp8_block_scale_moe = _lazy_import_wrapper(
flashinfer_cutlass_fused_moe
=
_lazy_import_wrapper
(
"flashinfer.fused_moe"
,
flashinfer_cutlass_fused_moe
=
_lazy_import_wrapper
(
"flashinfer.fused_moe"
,
"cutlass_fused_moe"
)
"cutlass_fused_moe"
)
fp4_quantize
=
_lazy_import_wrapper
(
"flashinfer"
,
"fp4_quantize"
)
fp4_quantize
=
_lazy_import_wrapper
(
"flashinfer"
,
"fp4_quantize"
)
fp4_swizzle_
blockscale
=
_lazy_import_wrapper
(
"flashinfer"
,
block
_
scale
_interleave
=
_lazy_import_wrapper
(
"flashinfer"
,
"
fp4_swizzle_
blockscale"
)
"block
_
scale
_interleave
"
)
# Special case for autotune since it returns a context manager
# Special case for autotune since it returns a context manager
autotune
=
_lazy_import_wrapper
(
autotune
=
_lazy_import_wrapper
(
...
@@ -95,7 +95,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
...
@@ -95,7 +95,7 @@ def has_flashinfer_cutlass_fused_moe() -> bool:
required_functions
=
[
required_functions
=
[
(
"flashinfer.fused_moe"
,
"cutlass_fused_moe"
),
(
"flashinfer.fused_moe"
,
"cutlass_fused_moe"
),
(
"flashinfer"
,
"fp4_quantize"
),
(
"flashinfer"
,
"fp4_quantize"
),
(
"flashinfer"
,
"
fp4_swizzle_
blockscale"
),
(
"flashinfer"
,
"block
_
scale
_interleave
"
),
]
]
for
module_name
,
attr_name
in
required_functions
:
for
module_name
,
attr_name
in
required_functions
:
...
@@ -110,7 +110,7 @@ __all__ = [
...
@@ -110,7 +110,7 @@ __all__ = [
"flashinfer_trtllm_fp8_block_scale_moe"
,
"flashinfer_trtllm_fp8_block_scale_moe"
,
"flashinfer_cutlass_fused_moe"
,
"flashinfer_cutlass_fused_moe"
,
"fp4_quantize"
,
"fp4_quantize"
,
"
fp4_swizzle_
blockscale"
,
"block
_
scale
_interleave
"
,
"autotune"
,
"autotune"
,
"has_flashinfer_moe"
,
"has_flashinfer_moe"
,
"has_flashinfer_cutlass_fused_moe"
,
"has_flashinfer_cutlass_fused_moe"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment