Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f7dcce7a
Unverified
Commit
f7dcce7a
authored
Aug 11, 2025
by
Wentao Ye
Committed by
GitHub
Aug 11, 2025
Browse files
[Feature] Add `VLLM_USE_DEEP_GEMM_E8M0` Env to Control E8M0 Scale (#21968)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
8e13d9fe
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
65 additions
and
39 deletions
+65
-39
tests/kernels/moe/test_block_fp8.py
tests/kernels/moe/test_block_fp8.py
+3
-2
tests/kernels/moe/test_deepep_deepgemm_moe.py
tests/kernels/moe/test_deepep_deepgemm_moe.py
+3
-3
vllm/envs.py
vllm/envs.py
+5
-0
vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
.../model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+2
-2
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+3
-3
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+3
-3
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+7
-12
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+2
-4
vllm/utils/deep_gemm.py
vllm/utils/deep_gemm.py
+37
-10
No files found.
tests/kernels/moe/test_block_fp8.py
View file @
f7dcce7a
...
...
@@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk
,
modular_triton_fused_moe
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
has_deep_gemm
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_used
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_
e8m0_
used
dg_available
=
has_deep_gemm
()
...
...
@@ -224,7 +224,8 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
skipif
(
not
dg_available
,
reason
=
"DeepGemm kernels not available."
)
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_used
(),
reason
=
"Not E8M0 scale MOE"
)
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_e8m0_used
(),
reason
=
"Not E8M0 scale MOE"
)
@
torch
.
inference_mode
()
def
test_w8a8_block_fp8_deep_gemm_fused_moe
(
M
,
N
,
K
,
E
,
topk
,
seed
,
monkeypatch
):
...
...
tests/kernels/moe/test_deepep_deepgemm_moe.py
View file @
f7dcce7a
...
...
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
has_deep_ep
,
has_deep_gemm
from
vllm.utils.deep_gemm
import
(
is_blackwell_deep_gemm_used
,
from
vllm.utils.deep_gemm
import
(
is_blackwell_deep_gemm_
e8m0_
used
,
is_deep_gemm_supported
)
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
...
...
@@ -370,7 +370,7 @@ NUM_EXPERTS = [32]
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[(
2
,
1
)])
@
requires_deep_ep
@
requires_deep_gemm
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_used
(),
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_
e8m0_
used
(),
reason
=
"Skipping test for Blackwell DeepGEMM"
)
def
test_ht_deepep_deepgemm_moe
(
mnk
:
tuple
[
int
,
int
,
int
],
num_experts
:
int
,
topk
:
int
,
world_dp_size
:
tuple
[
int
,
int
]):
...
...
@@ -427,7 +427,7 @@ USE_FP8_DISPATCH = [False]
@
pytest
.
mark
.
parametrize
(
"world_dp_size"
,
[(
2
,
1
)])
@
requires_deep_ep
@
requires_deep_gemm
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_used
(),
@
pytest
.
mark
.
skipif
(
is_blackwell_deep_gemm_
e8m0_
used
(),
reason
=
"Skipping test for Blackwell DeepGEMM"
)
def
test_ll_deepep_deepgemm_moe
(
mnk
:
tuple
[
int
,
int
,
int
],
...
...
vllm/envs.py
View file @
f7dcce7a
...
...
@@ -127,6 +127,7 @@ if TYPE_CHECKING:
VLLM_TPU_MOST_MODEL_LEN
:
Optional
[
int
]
=
None
VLLM_TPU_USING_PATHWAYS
:
bool
=
False
VLLM_USE_DEEP_GEMM
:
bool
=
False
VLLM_USE_DEEP_GEMM_E8M0
:
bool
=
True
VLLM_SKIP_DEEP_GEMM_WARMUP
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_FP8
:
bool
=
False
VLLM_USE_FLASHINFER_MOE_FP4
:
bool
=
False
...
...
@@ -925,6 +926,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_DEEP_GEMM"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_DEEP_GEMM"
,
"0"
))),
# Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs.
# E8M0 is faster on B200 but may reduce accuracy.
"VLLM_USE_DEEP_GEMM_E8M0"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_DEEP_GEMM_E8M0"
,
"1"
))),
# DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm
# JIT all the required kernels before model execution so there is no
# JIT'ing in the hot-path. However, this warmup increases the engine
...
...
vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
View file @
f7dcce7a
...
...
@@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
from
vllm.model_executor.layers.fused_moe.utils
import
_resize_cache
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils.deep_gemm
import
(
fp8_m_grouped_gemm_nt_masked
,
is_blackwell_deep_gemm_used
)
is_blackwell_deep_gemm_
e8m0_
used
)
logger
=
init_logger
(
__name__
)
...
...
@@ -176,7 +176,7 @@ def silu_mul_fp8_quant_deep_gemm(
eps
,
fp8_min
,
fp8_max
,
is_blackwell_deep_gemm_used
(),
is_blackwell_deep_gemm_
e8m0_
used
(),
BLOCK
=
group_size
,
NUM_STAGES
=
8
,
num_warps
=
1
,
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
f7dcce7a
...
...
@@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils
import
direct_register_custom_op
,
is_torch_equal_or_newer
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_used
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_
e8m0_
used
from
.rocm_aiter_fused_moe
import
is_rocm_aiter_moe_enabled
...
...
@@ -1387,8 +1387,8 @@ def fused_experts(hidden_states: torch.Tensor,
# E8M0 scale, which means we requantize the weight and input to the specific
# scale. Falling back to cutlass or triton in some cases would cause
# accuracy issues.
should_use_deep_gemm
=
is_blackwell_deep_gemm_
used
()
or
_valid_deep_gemm
(
hidden_states
,
w1
,
w2
)
should_use_deep_gemm
=
is_blackwell_deep_gemm_
e8m0_used
(
)
or
_valid_deep_gemm
(
hidden_states
,
w1
,
w2
)
if
(
allow_deep_gemm
and
use_fp8_w8a8
and
should_use_deep_gemm
):
assert
apply_router_weight_on_input
is
False
assert
is_act_and_mul
,
(
...
...
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
View file @
f7dcce7a
...
...
@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
DeepGemmExperts
,
_valid_deep_gemm
,
_valid_deep_gemm_shape
,
deep_gemm_block_shape
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
TritonExperts
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_used
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_
e8m0_
used
class
TritonOrDeepGemmExperts
(
mk
.
FusedMoEPermuteExpertsUnpermute
):
...
...
@@ -107,7 +107,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
# Note: the deep gemm workspaces are strictly larger than the triton
# workspaces so we can be pessimistic here and allocate for DeepGemm
# even if we fall back to triton later, e.g. if expert maps are set.
if
self
.
allow_deep_gemm
and
(
is_blackwell_deep_gemm_used
()
if
self
.
allow_deep_gemm
and
(
is_blackwell_deep_gemm_
e8m0_
used
()
or
_valid_deep_gemm_shape
(
M
,
N
,
K
)):
assert
self
.
deep_gemm_expert
is
not
None
return
self
.
deep_gemm_expert
.
workspace_shapes
(
...
...
@@ -133,7 +133,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
extra_expert_args
:
Optional
[
dict
[
str
,
Any
]]):
use_deep_gemm
=
(
self
.
allow_deep_gemm
and
(
_valid_deep_gemm
(
hidden_states
,
w1
,
w2
)
or
is_blackwell_deep_gemm_used
()))
or
is_blackwell_deep_gemm_
e8m0_
used
()))
experts
=
self
.
deep_gemm_expert
if
use_deep_gemm
else
self
.
triton_expert
assert
experts
is
not
None
...
...
vllm/model_executor/layers/quantization/fp8.py
View file @
f7dcce7a
...
...
@@ -45,7 +45,8 @@ from vllm.model_executor.utils import set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.scalar_type
import
scalar_types
from
vllm.utils
import
has_deep_gemm
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_used
from
vllm.utils.deep_gemm
import
(
is_blackwell_deep_gemm_e8m0_used
,
is_deep_gemm_supported
)
from
vllm.utils.flashinfer
import
has_flashinfer_moe
if
TYPE_CHECKING
:
...
...
@@ -415,10 +416,10 @@ class Fp8LinearMethod(LinearMethodBase):
# Activations not quantized for marlin.
del
layer
.
input_scale
# On B200,
DeepGemm only support E8M0 scale, which means
we need to
# On B200,
if E8M0 for DeepGemm is used,
we need to
# requantize the weight and input to the specific scale
# at the same time.
if
is_blackwell_deep_gemm_used
():
if
is_blackwell_deep_gemm_
e8m0_
used
():
assert
layer
.
weight_block_size
is
not
None
block_sz
=
tuple
(
layer
.
weight_block_size
)
requant_weight_ue8m0_inplace
(
...
...
@@ -505,15 +506,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
elif
not
self
.
block_quant
:
logger
.
warning_once
(
"Model is not block quantized. Not using "
"DeepGemm kernels"
)
elif
(
current_platform
.
is_cuda
()
and
current_platform
.
is_device_capability
(
90
)):
elif
(
is_deep_gemm_supported
()):
logger
.
info_once
(
"Using DeepGemm kernels for Fp8MoEMethod."
)
self
.
allow_deep_gemm
=
True
elif
(
current_platform
.
is_cuda
()
and
is_blackwell_deep_gemm_used
()):
logger
.
info_once
(
"Using DeepGemm SM100 kernels for "
"Fp8MoEMethod."
)
self
.
allow_deep_gemm
=
True
else
:
logger
.
warning_once
(
"DeepGemm not supported on the current platform."
)
...
...
@@ -725,7 +720,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
# DeepGemm scales need to be transposed and aligned. We try to do
# it ahead of time for performance reasons.
if
self
.
allow_deep_gemm
and
not
is_blackwell_deep_gemm_used
():
if
self
.
allow_deep_gemm
and
not
is_blackwell_deep_gemm_
e8m0_
used
():
# Lazy import to avoid CUDA initialization problems.
if
_is_col_major
(
layer
.
w13_weight_scale_inv
):
layer
.
w13_weight_scale_inv
=
\
...
...
@@ -851,7 +846,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
del
layer
.
w13_input_scale
del
layer
.
w2_input_scale
if
is_blackwell_deep_gemm_used
():
if
is_blackwell_deep_gemm_
e8m0_
used
():
assert
layer
.
weight_block_size
is
not
None
# Re-quantise the expert weights so their scales are UE8M0.
block_sz
=
tuple
(
layer
.
weight_block_size
)
...
...
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
f7dcce7a
...
...
@@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils
import
cdiv
,
direct_register_custom_op
,
has_deep_gemm
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_used
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_
e8m0_
used
logger
=
init_logger
(
__name__
)
...
...
@@ -394,10 +394,8 @@ def per_token_group_quant_fp8(
tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
scaling factor.
"""
# TODO(wentao): refactor this
# use_ue8m0 should be a global flag that could be set by user
if
use_ue8m0
is
None
:
use_ue8m0
=
is_blackwell_deep_gemm_used
()
use_ue8m0
=
is_blackwell_deep_gemm_
e8m0_
used
()
dtype
=
current_platform
.
fp8_dtype
()
if
dtype
is
None
else
dtype
assert
(
x
.
shape
[
-
1
]
%
group_size
==
0
),
(
f
"the last dimension of `x`
{
x
.
shape
[
-
1
]
}
must be divisible "
...
...
vllm/utils/deep_gemm.py
View file @
f7dcce7a
...
...
@@ -31,19 +31,37 @@ def is_deep_gemm_supported() -> bool:
@
functools
.
cache
def
is_blackwell_deep_gemm_used
()
->
bool
:
"""Return ``True`` if vLLM is configured to use DeepGEMM
on a
Blackwell-class GPU.
def
is_blackwell_deep_gemm_
e8m0_
used
()
->
bool
:
"""Return ``True`` if vLLM is configured to use DeepGEMM
E8M0 scale on a
Blackwell-class GPU.
"""
if
not
(
envs
.
VLLM_USE_DEEP_GEMM
and
has_deep_gemm
()):
if
not
(
envs
.
VLLM_USE_DEEP_GEMM
):
logger
.
debug_once
(
"DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM=0."
)
return
False
if
not
has_deep_gemm
():
logger
.
debug_once
(
"DeepGEMM E8M0 disabled: DeepGEMM backend missing."
)
return
False
if
not
envs
.
VLLM_USE_DEEP_GEMM_E8M0
:
logger
.
debug_once
(
"DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM_E8M0=0."
)
return
False
_lazy_init
()
if
_fp8_gemm_nt_impl
is
None
:
logger
.
debug_once
(
"DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found"
)
return
False
return
(
current_platform
.
is_cuda
()
and
current_platform
.
is_device_capability
(
100
))
enabled
=
(
current_platform
.
is_cuda
()
and
current_platform
.
has_device_capability
(
100
))
if
enabled
:
logger
.
debug_once
(
"DeepGEMM E8M0 enabled on Blackwell GPU."
)
else
:
logger
.
debug_once
(
"DeepGEMM E8M0 disabled: not running on Blackwell GPU."
)
return
enabled
def
_missing
(
*
_
:
Any
,
**
__
:
Any
)
->
NoReturn
:
...
...
@@ -109,21 +127,30 @@ def fp8_gemm_nt(*args, **kwargs):
_lazy_init
()
if
_fp8_gemm_nt_impl
is
None
:
return
_missing
(
*
args
,
**
kwargs
)
return
_fp8_gemm_nt_impl
(
*
args
,
**
kwargs
)
return
_fp8_gemm_nt_impl
(
*
args
,
disable_ue8m0_cast
=
not
is_blackwell_deep_gemm_e8m0_used
(),
**
kwargs
)
def
m_grouped_fp8_gemm_nt_contiguous
(
*
args
,
**
kwargs
):
_lazy_init
()
if
_grouped_impl
is
None
:
return
_missing
(
*
args
,
**
kwargs
)
return
_grouped_impl
(
*
args
,
**
kwargs
)
return
_grouped_impl
(
*
args
,
disable_ue8m0_cast
=
not
is_blackwell_deep_gemm_e8m0_used
(),
**
kwargs
)
def
fp8_m_grouped_gemm_nt_masked
(
*
args
,
**
kwargs
):
_lazy_init
()
if
_grouped_masked_impl
is
None
:
return
_missing
(
*
args
,
**
kwargs
)
return
_grouped_masked_impl
(
*
args
,
**
kwargs
)
return
_grouped_masked_impl
(
*
args
,
disable_ue8m0_cast
=
not
is_blackwell_deep_gemm_e8m0_used
(),
**
kwargs
)
def
_ceil_to_ue8m0
(
x
:
torch
.
Tensor
):
...
...
@@ -181,6 +208,6 @@ __all__ = [
"m_grouped_fp8_gemm_nt_contiguous"
,
"fp8_m_grouped_gemm_nt_masked"
,
"per_block_cast_to_fp8"
,
"is_blackwell_deep_gemm_used"
,
"is_blackwell_deep_gemm_
e8m0_
used"
,
"is_deep_gemm_supported"
,
]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment