Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
42135d68
Unverified
Commit
42135d68
authored
Jan 21, 2026
by
Robert Shaw
Committed by
GitHub
Jan 21, 2026
Browse files
[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (#32414)
parent
e14467be
Changes
82
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
2 deletions
+6
-2
vllm/model_executor/warmup/deep_gemm_warmup.py
vllm/model_executor/warmup/deep_gemm_warmup.py
+4
-0
vllm/v1/attention/backends/flashinfer.py
vllm/v1/attention/backends/flashinfer.py
+2
-2
No files found.
vllm/model_executor/warmup/deep_gemm_warmup.py
View file @
42135d68
...
...
@@ -128,11 +128,15 @@ def _fp8_linear_may_use_deep_gemm(module: torch.nn.Module) -> bool:
"""
Return True if the input module/layer could be processed with DeepGEMM.
"""
# FIXME: this logic is brittle and incorrect, since we
# could use DeepGEMM for more than just Fp8LinearMethod
block_size
=
get_mk_alignment_for_contiguous_layout
()[
0
]
if
not
(
isinstance
(
module
,
LinearBase
)
and
isinstance
(
module
.
quant_method
,
Fp8LinearMethod
)
and
module
.
quant_method
.
block_quant
and
not
module
.
quant_method
.
use_marlin
):
return
False
...
...
vllm/v1/attention/backends/flashinfer.py
View file @
42135d68
...
...
@@ -29,7 +29,7 @@ from vllm.model_executor.layers.batch_invariant import (
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
QuantKey
,
kFp8StaticTensorSym
,
kNvfp4
Quant
,
kNvfp4
Dynamic
,
)
from
vllm.platforms
import
current_platform
from
vllm.platforms.interface
import
DeviceCapability
...
...
@@ -1184,7 +1184,7 @@ class FlashInferImpl(AttentionImpl):
return
(
self
.
support_trtllm_attn
and
self
.
kv_cache_dtype
.
startswith
(
"fp8"
)
and
quant_key
in
(
kFp8StaticTensorSym
,
kNvfp4
Quant
)
and
quant_key
in
(
kFp8StaticTensorSym
,
kNvfp4
Dynamic
)
)
# FlashInfer requires attention sinks to be float32
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment