Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3bd8335b
Unverified
Commit
3bd8335b
authored
Dec 19, 2025
by
Wentao Ye
Committed by
GitHub
Dec 19, 2025
Browse files
[Refactor] Refactor for `DeepGemmQuantScaleFMT` using cache (#30898)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
1ab52135
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
29 additions
and
9 deletions
+29
-9
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+2
-2
vllm/utils/deep_gemm.py
vllm/utils/deep_gemm.py
+27
-7
No files found.
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
3bd8335b
...
...
@@ -31,6 +31,7 @@ from vllm.model_executor.utils import replace_parameter
from
vllm.platforms
import
current_platform
from
vllm.triton_utils
import
tl
,
triton
from
vllm.utils.deep_gemm
import
(
DeepGemmQuantScaleFMT
,
fp8_gemm_nt
,
is_deep_gemm_e8m0_used
,
is_deep_gemm_supported
,
...
...
@@ -247,7 +248,6 @@ class W8A8BlockFp8LinearOp:
self
.
act_quant_group_shape
=
act_quant_group_shape
self
.
is_deep_gemm_supported
=
is_deep_gemm_supported
()
self
.
is_hopper
=
current_platform
.
is_device_capability
(
90
)
self
.
is_blackwell
=
current_platform
.
is_device_capability_family
(
100
)
self
.
use_deep_gemm_e8m0
=
is_deep_gemm_e8m0_used
()
# Get the correct blockscale mul and input quant operations.
...
...
@@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
weight
:
torch
.
Tensor
,
weight_scale
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
if
self
.
use_d
eep
_g
emm
_e8m0
and
self
.
is_blackwell
:
if
D
eep
G
emm
QuantScaleFMT
.
from_oracle
()
==
DeepGemmQuantScaleFMT
.
UE8M0
:
q_input
,
input_scale
=
per_token_group_quant_fp8_packed_for_deepgemm
(
input_2d
,
group_size
=
self
.
act_quant_group_shape
.
col
,
...
...
vllm/utils/deep_gemm.py
View file @
3bd8335b
...
...
@@ -32,16 +32,35 @@ class DeepGemmQuantScaleFMT(Enum):
# element contains 4 scale values.
UE8M0
=
2
@
staticmethod
def
from_oracle
()
->
"DeepGemmQuantScaleFMT"
:
if
not
is_deep_gemm_e8m0_used
():
return
DeepGemmQuantScaleFMT
.
FLOAT32
return
(
DeepGemmQuantScaleFMT
.
UE8M0
@
classmethod
def
init_oracle_cache
(
cls
)
->
None
:
"""Initialize the oracle decision and store it in the class cache"""
cached
=
getattr
(
cls
,
"_oracle_cache"
,
None
)
if
cached
is
not
None
:
return
use_e8m0
=
(
envs
.
VLLM_USE_DEEP_GEMM_E8M0
and
is_deep_gemm_supported
()
and
(
_fp8_gemm_nt_impl
is
not
None
)
)
if
not
use_e8m0
:
cls
.
_oracle_cache
=
cls
.
FLOAT32
# type: ignore
return
cls
.
_oracle_cache
=
(
# type: ignore
cls
.
UE8M0
if
current_platform
.
is_device_capability_family
(
100
)
else
DeepGemmQuantScaleFMT
.
FLOAT32_CEIL_UE8M0
else
cls
.
FLOAT32_CEIL_UE8M0
)
@
classmethod
def
from_oracle
(
cls
)
->
"DeepGemmQuantScaleFMT"
:
"""Return the pre-initialized oracle decision"""
cached
=
getattr
(
cls
,
"_oracle_cache"
,
None
)
assert
cached
is
not
None
,
"DeepGemmQuantScaleFMT oracle cache not initialized"
return
cached
@
functools
.
cache
def
is_deep_gemm_supported
()
->
bool
:
...
...
@@ -149,6 +168,7 @@ def _lazy_init() -> None:
_transform_sf_into_required_layout_impl
=
getattr
(
_dg
,
"transform_sf_into_required_layout"
,
None
)
DeepGemmQuantScaleFMT
.
init_oracle_cache
()
def
get_num_sms
()
->
int
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment