Commit be0058bc (Unverified)
Authored Oct 20, 2025 by Liu-congo; committed by GitHub, Oct 19, 2025

[BugFix] replace the input_to_float8 used in dsv2 (#11612)

Signed-off-by: Liu-congo <1502632128@qq.com>

Parent: 9e3be1fa
Showing 1 changed file with 17 additions and 18 deletions:

python/sglang/srt/models/deepseek_v2.py (+17, -18)
python/sglang/srt/models/deepseek_v2.py @ be0058bc
@@ -92,7 +92,6 @@ from sglang.srt.layers.quantization.fp8_utils import (
     block_quant_dequant,
     block_quant_to_tensor_quant,
     channel_quant_to_tensor_quant,
-    input_to_float8,
     normalize_e4m3fn_to_e4m3fnuz,
     quant_weight_ue8m0,
     requant_weight_ue8m0_inplace,
@@ -1623,15 +1622,15 @@ class DeepseekV2AttentionMLA(nn.Module):
                 self.w_kc.to(torch.bfloat16) * self.w_scale,
             )
         elif self.w_kc.dtype == torch.float8_e4m3fn:
-            # TODO fix the per_tensor_quant_mla_fp8 for cublas 12.9
-            if _is_cublas_ge_129:
-                q_nope_val, q_nope_scale = input_to_float8(
-                    q_nope.transpose(0, 1), torch.float8_e4m3fn
-                )
-            else:
-                q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
-                    q_nope.transpose(0, 1), zero_allocator.allocate(1)
-                )
+            # fix bmm_fp8 error under cublas12.9 caused by bumpallocator, detail in pr#11612
+            q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
+                q_nope.transpose(0, 1),
+                (
+                    torch.zeros((1,), dtype=torch.float32, device=q_nope.device)
+                    if _is_cublas_ge_129
+                    else zero_allocator.allocate(1)
+                ),
+            )
             q_nope_out = bmm_fp8(
                 q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
             )
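For context, both input_to_float8 and per_tensor_quant_mla_fp8 produce a (values, scale) pair that bmm_fp8 consumes, so the replacement keeps the call site's shape. Below is a minimal, hypothetical sketch of per-tensor fp8 quantization; it is not sglang's actual helper, just the standard amax-based scaling pattern for torch.float8_e4m3fn:

import torch

FP8_E4M3_MAX = 448.0  # largest finite value representable in torch.float8_e4m3fn

def per_tensor_fp8_quant_sketch(x: torch.Tensor):
    # One scale for the whole tensor, derived from its absolute maximum.
    amax = x.abs().amax().to(torch.float32).clamp(min=1e-12)
    scale = amax / FP8_E4M3_MAX
    # Scale into the fp8 range, clamp, and cast; the scale is returned so a
    # bmm_fp8-style kernel can dequantize the matmul output.
    x_fp8 = (x / scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX).to(torch.float8_e4m3fn)
    return x_fp8, scale.reshape(1)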
@@ -1772,14 +1771,14 @@ class DeepseekV2AttentionMLA(nn.Module):
             attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
         elif self.w_vc.dtype == torch.float8_e4m3fn:
-            if _is_cublas_ge_129:
-                attn_output_val, attn_output_scale = input_to_float8(
-                    attn_output.transpose(0, 1), torch.float8_e4m3fn
-                )
-            else:
-                attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
-                    attn_output.transpose(0, 1), zero_allocator.allocate(1)
-                )
+            attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
+                attn_output.transpose(0, 1),
+                (
+                    torch.zeros((1,), dtype=torch.float32, device=attn_output.device)
+                    if _is_cublas_ge_129
+                    else zero_allocator.allocate(1)
+                ),
+            )
             attn_bmm_output = bmm_fp8(
                 attn_output_val, self.w_vc,
...
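The added comment attributes the bmm_fp8 failure under cuBLAS 12.9 to the bump allocator that normally supplies the one-element scale buffer. The sketch below illustrates the general bump-allocation pattern and the scale-buffer selection the patch introduces; it is a hypothetical stand-in, not sglang's real BumpAllocator or zero_allocator:

import torch

class BumpAllocatorSketch:
    """Hypothetical bump-style allocator: hands out slices of one shared buffer."""

    def __init__(self, size: int, dtype=torch.float32, device: str = "cpu"):
        self._buffer = torch.zeros(size, dtype=dtype, device=device)
        self._offset = 0

    def allocate(self, n: int) -> torch.Tensor:
        # Returns a view into the shared buffer, not an independent tensor.
        view = self._buffer[self._offset : self._offset + n]
        self._offset += n
        return view

# Scale-buffer selection mirroring the patched code: a standalone tensor on
# cuBLAS >= 12.9, the shared allocator view otherwise.
_is_cublas_ge_129 = True  # placeholder flag for illustration only
zero_allocator = BumpAllocatorSketch(size=64)
scale_buf = (
    torch.zeros((1,), dtype=torch.float32)
    if _is_cublas_ge_129
    else zero_allocator.allocate(1)
)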