change / sglang / Commits / c80a96da

Unverified commit c80a96da
Authored Oct 11, 2025 by Liu-congo, committed by GitHub Oct 10, 2025

[BugFix] test_mla_fp8.py fails on Cublas 12.9 (#11360)

Signed-off-by: Liu-congo <1502632128@qq.com>

Parent: eae9a9fb
Showing 2 changed files with 31 additions and 8 deletions (+31 −8):

- python/sglang/srt/models/deepseek_v2.py (+20 −8)
- python/sglang/srt/utils/common.py (+11 −0)
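To see which path a given environment will take before running test_mla_fp8.py, the new helper can be called directly. A minimal sketch, assuming an installed sglang build that re-exports the helper from sglang.srt.utils the same way the deepseek_v2.py hunk below imports it:

```python
# Illustrative check: reports which MLA FP8 quantization path this environment
# would select, mirroring the _is_cublas_ge_129 flag introduced in this commit.
from sglang.srt.utils import is_nvidia_cublas_cu12_version_ge_12_9

if is_nvidia_cublas_cu12_version_ge_12_9():
    print("nvidia-cublas-cu12 >= 12.9: MLA FP8 uses input_to_float8")
else:
    print("nvidia-cublas-cu12 < 12.9 or absent: MLA FP8 uses per_tensor_quant_mla_fp8")
```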
python/sglang/srt/models/deepseek_v2.py
@@ -94,6 +94,7 @@ from sglang.srt.layers.quantization.fp8_utils import (
    block_quant_dequant,
    block_quant_to_tensor_quant,
    channel_quant_to_tensor_quant,
    input_to_float8,
    normalize_e4m3fn_to_e4m3fnuz,
    requant_weight_ue8m0_inplace,
)

@@ -131,6 +132,7 @@ from sglang.srt.utils import (
    is_hip,
    is_non_idle_and_non_empty,
    is_npu,
    is_nvidia_cublas_cu12_version_ge_12_9,
    is_sm100_supported,
    log_info_on_rank0,
    make_layers,

@@ -189,6 +191,7 @@ else:
_is_flashinfer_available = is_flashinfer_available()
_is_sm100_supported = is_cuda() and is_sm100_supported()
_is_cublas_ge_129 = is_nvidia_cublas_cu12_version_ge_12_9()

logger = logging.getLogger(__name__)

@@ -1572,9 +1575,14 @@ class DeepseekV2AttentionMLA(nn.Module):
                self.w_kc.to(torch.bfloat16) * self.w_scale,
            )
        elif self.w_kc.dtype == torch.float8_e4m3fn:
            # TODO fix the per_tensor_quant_mla_fp8 for cublas 12.9
            if _is_cublas_ge_129:
                q_nope_val, q_nope_scale = input_to_float8(
                    q_nope.transpose(0, 1), torch.float8_e4m3fn
                )
            else:
                q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
                    q_nope.transpose(0, 1), zero_allocator.allocate(1)
                )
            q_nope_out = bmm_fp8(
                q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16

@@ -1716,9 +1724,13 @@ class DeepseekV2AttentionMLA(nn.Module):
            attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
        elif self.w_vc.dtype == torch.float8_e4m3fn:
            if _is_cublas_ge_129:
                attn_output_val, attn_output_scale = input_to_float8(
                    attn_output.transpose(0, 1), torch.float8_e4m3fn
                )
            else:
                attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
                    attn_output.transpose(0, 1), zero_allocator.allocate(1)
                )
            attn_bmm_output = bmm_fp8(
                attn_output_val,
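For readers unfamiliar with the two helpers: both return a quantized FP8 tensor plus a scale that is then passed to bmm_fp8; this commit only changes which helper computes that pair when cuBLAS >= 12.9 is installed. Below is a minimal sketch of the standard per-tensor FP8 (e4m3) scheme, with illustrative names and the usual amax / finfo.max scaling assumed rather than copied from sglang:

```python
import torch


def per_tensor_fp8_sketch(x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn):
    """Illustrative per-tensor FP8 quantization: one scale for the whole tensor."""
    finfo = torch.finfo(dtype)                     # float8_e4m3fn max is 448.0
    amax = x.abs().max().clamp(min=1e-12).float()  # guard against all-zero input
    scale = amax / finfo.max                       # map the observed range onto the FP8 range
    x_fp8 = (x.float() / scale).clamp(finfo.min, finfo.max).to(dtype)
    return x_fp8, scale                            # dequantize with x_fp8.float() * scale
```

The actual sglang helpers may differ in scale convention (scale vs. its reciprocal) and in where the scale is stored (per_tensor_quant_mla_fp8 receives a buffer from zero_allocator.allocate(1)), so treat this only as the shape of the computation, not the library's implementation.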
python/sglang/srt/utils/common.py
@@ -263,6 +263,17 @@ def is_flashinfer_available():
    return importlib.util.find_spec("flashinfer") is not None and is_cuda()


def is_nvidia_cublas_cu12_version_ge_12_9():
    """
    temporary fix for issue #11272
    """
    try:
        installed_version = version("nvidia-cublas-cu12")
    except PackageNotFoundError:
        return False
    return pkg_version.parse(installed_version) >= pkg_version.parse("12.9")


def random_uuid() -> str:
    return str(uuid.uuid4().hex)
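The hunk above does not show the imports the helper relies on. A self-contained sketch of the same check, with the import locations assumed (importlib.metadata for version/PackageNotFoundError, packaging.version aliased as pkg_version) rather than taken from common.py:

```python
from importlib.metadata import PackageNotFoundError, version

from packaging import version as pkg_version


def is_nvidia_cublas_cu12_version_ge_12_9() -> bool:
    """True if the installed nvidia-cublas-cu12 wheel is at least 12.9 (issue #11272 workaround)."""
    try:
        installed_version = version("nvidia-cublas-cu12")  # PyPI wheel version string
    except PackageNotFoundError:
        return False  # wheel not installed (ROCm, CPU-only, or system CUDA): assume pre-12.9
    return pkg_version.parse(installed_version) >= pkg_version.parse("12.9")
```

Checking the installed wheel's metadata avoids loading the cuBLAS shared library just to read its version, which keeps the guard cheap enough to run at module import time.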