Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
394591e3
Unverified
Commit
394591e3
authored
Aug 22, 2025
by
Wentao Ye
Committed by
GitHub
Aug 21, 2025
Browse files
[Feature] Enable DeepGEMM Linear on B200; 1.5% E2E throughput improvement (#23351)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
3ac84966
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
13 additions
and
16 deletions
+13
-16
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+6
-16
vllm/utils/deep_gemm.py
vllm/utils/deep_gemm.py
+7
-0
No files found.
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
394591e3
...
...
@@ -19,8 +19,9 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
CUTLASS_BLOCK_FP8_SUPPORTED
)
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils
import
cdiv
,
direct_register_custom_op
,
has_deep_gemm
from
vllm.utils.deep_gemm
import
is_blackwell_deep_gemm_e8m0_used
from
vllm.utils
import
cdiv
,
direct_register_custom_op
from
vllm.utils.deep_gemm
import
(
is_blackwell_deep_gemm_e8m0_used
,
should_use_deepgemm_for_fp8_linear
)
logger
=
init_logger
(
__name__
)
...
...
@@ -108,19 +109,6 @@ def dispatch_w8a8_blockscale_func(
return
w8a8_block_fp8_matmul
def should_use_deepgemm(output_dtype: torch.dtype, weight: torch.Tensor):
    """
    Decide whether the DeepGEMM path applies to this linear layer.

    The checks run in a fixed order and stop at the first one that fails:
    the platform is CUDA with device capability 90, DeepGEMM is available
    and enabled through ``VLLM_USE_DEEP_GEMM``, the output dtype is
    bfloat16, and both of the first two weight dimensions are multiples
    of 128.

    Args:
        output_dtype: dtype the matmul result will be produced in.
        weight: weight tensor; only ``shape[0]`` and ``shape[1]`` are read.

    Returns:
        A truthy value when every check passes, otherwise the falsy result
        of the first failing check.
    """
    alignment = 128

    # Platform gate first so the remaining checks never run elsewhere.
    on_supported_gpu = (current_platform.is_cuda()
                        and current_platform.is_device_capability(90))
    if not on_supported_gpu:
        return on_supported_gpu

    # DeepGEMM must be both installed and opted into.
    deep_gemm_enabled = has_deep_gemm() and envs.VLLM_USE_DEEP_GEMM
    if not deep_gemm_enabled:
        return deep_gemm_enabled

    is_bf16_output = output_dtype == torch.bfloat16
    if not is_bf16_output:
        return is_bf16_output

    # shape[1] is only inspected when shape[0] is already aligned.
    first_dim_aligned = weight.shape[0] % alignment == 0
    return first_dim_aligned and weight.shape[1] % alignment == 0
# TODO fix ROCm->Triton custom path:
# https://github.com/vllm-project/vllm/issues/14397
def
apply_w8a8_block_fp8_linear
(
...
...
@@ -139,7 +127,7 @@ def apply_w8a8_block_fp8_linear(
output_shape
=
[
*
input
.
shape
[:
-
1
],
weight
.
shape
[
0
]]
output_dtype
=
input
.
dtype
if
should_use_deepgemm
(
output_dtype
,
weight
):
if
should_use_deepgemm
_for_fp8_linear
(
output_dtype
,
weight
):
input_2d
=
input
.
view
(
-
1
,
input
.
shape
[
-
1
])
output_shape
=
[
*
input
.
shape
[:
-
1
],
weight
.
shape
[
0
]]
...
...
@@ -150,7 +138,9 @@ def apply_w8a8_block_fp8_linear(
column_major_scales
=
True
,
)
# ensure DeepGEMM-backed custom op is registered before use
import
vllm.model_executor.layers.quantization.deepgemm
# noqa: F401
output
=
torch
.
ops
.
vllm
.
w8a8_block_fp8_matmul_deepgemm
(
q_input
,
weight
,
...
...
vllm/utils/deep_gemm.py
View file @
394591e3
...
...
@@ -202,6 +202,12 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor):
return
1
-
sim
def should_use_deepgemm_for_fp8_linear(output_dtype: torch.dtype,
                                       weight: torch.Tensor):
    """
    Decide whether an FP8 linear layer should go through DeepGEMM.

    The checks run in order and stop at the first failure:
    ``is_deep_gemm_supported()`` must report support, the output dtype
    must be bfloat16, and both of the first two weight dimensions must be
    multiples of 128.

    Args:
        output_dtype: dtype the matmul result will be produced in.
        weight: weight tensor; only ``shape[0]`` and ``shape[1]`` are read.

    Returns:
        A truthy value when every check passes, otherwise the falsy result
        of the first failing check.
    """
    alignment = 128

    supported = is_deep_gemm_supported()
    if not supported:
        return supported

    is_bf16_output = output_dtype == torch.bfloat16
    if not is_bf16_output:
        return is_bf16_output

    # shape[1] is only inspected when shape[0] is already aligned.
    first_dim_aligned = weight.shape[0] % alignment == 0
    return first_dim_aligned and weight.shape[1] % alignment == 0
__all__
=
[
"calc_diff"
,
"fp8_gemm_nt"
,
...
...
@@ -210,4 +216,5 @@ __all__ = [
"per_block_cast_to_fp8"
,
"is_blackwell_deep_gemm_e8m0_used"
,
"is_deep_gemm_supported"
,
"should_use_deepgemm_for_fp8_linear"
,
]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment