Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
8da7ca78
Commit
8da7ca78
authored
Nov 26, 2025
by
renzhc
Browse files
添加w_scale=1的判断,消除两处elementwise数乘操作。
parent
263b5bde
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
22 additions
and
8 deletions
+22
-8
python/sglang/srt/models/deepseek_v2.py
python/sglang/srt/models/deepseek_v2.py
+22
-8
No files found.
python/sglang/srt/models/deepseek_v2.py
View file @
8da7ca78
...
@@ -1618,11 +1618,18 @@ class DeepseekV2AttentionMLA(nn.Module):
...
@@ -1618,11 +1618,18 @@ class DeepseekV2AttentionMLA(nn.Module):
torch
.
bfloat16
,
torch
.
bfloat16
,
q_nope_out
,
q_nope_out
,
)
)
else
:
else
:
# TODO: 手写融合算子
q_nope_out
=
torch
.
bmm
(
_q_nope_safe
=
q_nope
.
to
(
torch
.
bfloat16
).
transpose
(
0
,
1
)
q_nope
.
to
(
torch
.
bfloat16
).
transpose
(
0
,
1
),
_w_kc_safe
=
self
.
w_kc
.
to
(
torch
.
bfloat16
)
self
.
w_kc
.
to
(
torch
.
bfloat16
)
*
self
.
w_scale
,
if
abs
(
self
.
w_scale
-
1
)
<
1e-6
:
q_nope_out
=
torch
.
bmm
(
_q_nope_safe
,
_w_kc_safe
)
else
:
q_nope_out
=
torch
.
bmm
(
_q_nope_safe
,
_w_kc_safe
*
self
.
w_scale
,
)
)
# q_nope_out = torch.bmm(
# q_nope.to(torch.bfloat16).transpose(0, 1),
# self.w_kc.to(torch.bfloat16) * self.w_scale,
# )
elif
self
.
w_kc
.
dtype
==
torch
.
float8_e4m3fn
:
elif
self
.
w_kc
.
dtype
==
torch
.
float8_e4m3fn
:
# fix bmm_fp8 error under cublas12.9 caused by bumpallocator, detail in pr#11612
# fix bmm_fp8 error under cublas12.9 caused by bumpallocator, detail in pr#11612
q_nope_val
,
q_nope_scale
=
per_tensor_quant_mla_fp8
(
q_nope_val
,
q_nope_scale
=
per_tensor_quant_mla_fp8
(
...
@@ -1763,11 +1770,18 @@ class DeepseekV2AttentionMLA(nn.Module):
...
@@ -1763,11 +1770,18 @@ class DeepseekV2AttentionMLA(nn.Module):
torch
.
bfloat16
,
torch
.
bfloat16
,
attn_bmm_output
,
attn_bmm_output
,
)
)
else
:
else
:
# TODO: 手写融合算子
attn_bmm_output
=
torch
.
bmm
(
_attn_output_safe
=
attn_output
.
to
(
torch
.
bfloat16
).
transpose
(
0
,
1
)
attn_output
.
to
(
torch
.
bfloat16
).
transpose
(
0
,
1
),
_w_vc_safe
=
self
.
w_vc
.
to
(
torch
.
bfloat16
)
self
.
w_vc
.
to
(
torch
.
bfloat16
)
*
self
.
w_scale
,
if
abs
(
self
.
w_scale
-
1
)
<
1e-6
:
attn_bmm_output
=
torch
.
bmm
(
_attn_output_safe
,
_w_vc_safe
)
else
:
attn_bmm_output
=
torch
.
bmm
(
_attn_output_safe
,
_w_vc_safe
*
self
.
w_scale
,
)
)
# attn_bmm_output = torch.bmm(
# attn_output.to(torch.bfloat16).transpose(0, 1),
# self.w_vc.to(torch.bfloat16) * self.w_scale,
# )
if
self
.
o_proj
.
weight
.
dtype
==
torch
.
uint8
:
if
self
.
o_proj
.
weight
.
dtype
==
torch
.
uint8
:
attn_bmm_output
=
attn_bmm_output
.
transpose
(
0
,
1
)
attn_bmm_output
=
attn_bmm_output
.
transpose
(
0
,
1
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment